knitr::opts_chunk$set(echo = TRUE,
                      message=FALSE, 
                      warning=FALSE)

library(tidyverse)  # the usual
library(rvest)      # for scraping
library(janitor)    # the usual

Use regex to find phrases and patterns for coding

Some of you have had cases where there were phrases you wanted to find. You can use regex and str_detect() to find them , which returns a yes-no answer. If it finds the pattern, then you can do something. If it doesn’t, you can do something else.

Here’s an example using a tiny slice of visa applications from the Labor Department. (It’s a “tab-separated” rather than “comma-separated” line, so we have to use read_tsv instead of read_csv. ).

Here are a couple of things you might not know already:

  1. The col_types = cols(…) means that you can force any columns to be a specific type. In this case, we want zipcode to be “character”, or “c” type. Everything else it will guess.

  2. We don’t want to deal with upper and lower cased entries, so this turns everythng that is a character variable into its upper case representation.

mydata <- read_tsv("https://cronkitedata.github.io/cronkite-docs/assets/data/special/regex_h2bvisas.txt", 
                   col_types= cols(zipcode="c")) %>%
  mutate (across ( is.character, toupper))

Let’s take a look at what job types are being requested

mydata %>% count(job_title) 
job_title n
AG EQUIPMENT OPERATOR 1
AGRICULTURAL EQUIPMENT OPERATOR 1
AGRICULTURAL LABORERS/WORKERS 1
BEEKEEPER 4
COOKERS 1
EQUIPMENT OPERATOR 1
FARM LABOR 2
FARM MACHINE OPERATOR 3
FARM WORKER 2
FARM WORKER - GRAIN 1 1
FARM WORKER AND LABORER 4
FARM WORKER AND LABORERS 1
FARM WORKER; CROP 1
FARM WORKER/LABORER 1
FARM WORKERS/LABORER 2
FARM/IRRIGATION/LIVESTOCK WORKER 1
FARMWORKER 6
FARMWORKER: DIVERSIFIED 3
FARMWORKER*SEE ATTACHMENT FOR FULL TITLE 2
FARMWORKERS 1
FARMWORKERS & LABORERS, CITRUS 1
FARMWORKERS & LABORERS, CROP 5
FARMWORKERS AND LABORERS, CROP 5
FARMWORKERS, LABORERS, CROP NURSERY 1
FARMWORKERS, LABORERS, CROP, NURSERY, & GREENHOUSE 2
FIELD WORKER (BROCCOLI HARVEST) 1
FIELD WORKER (CELERY HARVESTER ORGANIC & CONVENTIO 1
FIELD WORKER (DATES) 1
FIELD WORKER (HARVEST) 2
FIELD WORKER (HEAD LETTUCE HARVEST) 1
FIELD WORKER (IRRIGATORS) 1
FIELD WORKER (MELON HARVEST, GENERAL LABOR) 1
FIELD WORKER (MIX ROMAINE AND ROMAINE HEARTS) 1
FIELD WORKERS 7
FIELD WORKERS (ICEBERG AND ROMAINE LETTUCE) 1
FIELD WORKERS (ICEBERG LETTUCE) 1
FIELD WORKERS (MIX ROMAINE AND ROMAINE HEARTS) 1
FIELD WORKERS: CITRUS, PEACHES, DATES 1
FIELD WORKERS: HARVEST/FIELD LABOR CROPS 1
GENERAL FARMWORKER 4
GENERAL FARMWORKERS 11
LETTUCE HARVESTER 1
LIVESTOCK/FARM/IRRIGATION WORKER & TRUCK DRIVER 1
NURSERY WORKER 1
NURSERY WORKERS 1
RANGE LIVESTOCK WORKER 2
SALAD PROCESSING AND FARM LABOR 1
SHEEPHERDER 16
VEGETABLE HARVEST WORKERS 1

There are all kinds of ways of saying “FARMWORKER” in this . Here’s a simple way to deal with it - to get a list of all farm labor, field workers and farm workers vs. other jobs. :

mydata %>%
  mutate ( new_title = 
             if_else (str_detect ( job_title,  "(FIELD|FARM).?(WORK|LABOR)" ) , 
                      "FARMWORKER", "OTHER")) %>%
  count ( new_title, job_title)
new_title job_title n
FARMWORKER FARM LABOR 2
FARMWORKER FARM WORKER 2
FARMWORKER FARM WORKER - GRAIN 1 1
FARMWORKER FARM WORKER AND LABORER 4
FARMWORKER FARM WORKER AND LABORERS 1
FARMWORKER FARM WORKER; CROP 1
FARMWORKER FARM WORKER/LABORER 1
FARMWORKER FARM WORKERS/LABORER 2
FARMWORKER FARMWORKER 6
FARMWORKER FARMWORKER: DIVERSIFIED 3
FARMWORKER FARMWORKER*SEE ATTACHMENT FOR FULL TITLE 2
FARMWORKER FARMWORKERS 1
FARMWORKER FARMWORKERS & LABORERS, CITRUS 1
FARMWORKER FARMWORKERS & LABORERS, CROP 5
FARMWORKER FARMWORKERS AND LABORERS, CROP 5
FARMWORKER FARMWORKERS, LABORERS, CROP NURSERY 1
FARMWORKER FARMWORKERS, LABORERS, CROP, NURSERY, & GREENHOUSE 2
FARMWORKER FIELD WORKER (BROCCOLI HARVEST) 1
FARMWORKER FIELD WORKER (CELERY HARVESTER ORGANIC & CONVENTIO 1
FARMWORKER FIELD WORKER (DATES) 1
FARMWORKER FIELD WORKER (HARVEST) 2
FARMWORKER FIELD WORKER (HEAD LETTUCE HARVEST) 1
FARMWORKER FIELD WORKER (IRRIGATORS) 1
FARMWORKER FIELD WORKER (MELON HARVEST, GENERAL LABOR) 1
FARMWORKER FIELD WORKER (MIX ROMAINE AND ROMAINE HEARTS) 1
FARMWORKER FIELD WORKERS 7
FARMWORKER FIELD WORKERS (ICEBERG AND ROMAINE LETTUCE) 1
FARMWORKER FIELD WORKERS (ICEBERG LETTUCE) 1
FARMWORKER FIELD WORKERS (MIX ROMAINE AND ROMAINE HEARTS) 1
FARMWORKER FIELD WORKERS: CITRUS, PEACHES, DATES 1
FARMWORKER FIELD WORKERS: HARVEST/FIELD LABOR CROPS 1
FARMWORKER GENERAL FARMWORKER 4
FARMWORKER GENERAL FARMWORKERS 11
FARMWORKER SALAD PROCESSING AND FARM LABOR 1
OTHER AG EQUIPMENT OPERATOR 1
OTHER AGRICULTURAL EQUIPMENT OPERATOR 1
OTHER AGRICULTURAL LABORERS/WORKERS 1
OTHER BEEKEEPER 4
OTHER COOKERS 1
OTHER EQUIPMENT OPERATOR 1
OTHER FARM MACHINE OPERATOR 3
OTHER FARM/IRRIGATION/LIVESTOCK WORKER 1
OTHER LETTUCE HARVESTER 1
OTHER LIVESTOCK/FARM/IRRIGATION WORKER & TRUCK DRIVER 1
OTHER NURSERY WORKER 1
OTHER NURSERY WORKERS 1
OTHER RANGE LIVESTOCK WORKER 2
OTHER SHEEPHERDER 16
OTHER VEGETABLE HARVEST WORKERS 1

Download all CSV files from a web page

The first step is to read the HTML page, and, using XPATH, extract out the “a” tags’ “href” attributes. Then we look for the pattern, .*csv$ .

base_page <- "https://data.sba.gov/dataset/ppp-foia"



pattern <- ".*csv$"

#get the base page read in rvest
my_page <- read_html(base_page)

#extract from them the attribute 'html' from the element 'a'
links <- html_nodes(my_page, "a") %>% 
  html_attr("href") %>%
  str_subset( ., pattern) 

links
##  [1] "https://data.sba.gov/dataset/8aa276e2-6cab-4f86-aca4-a7dde42adf24/resource/be89502c-1961-4a79-a44c-979eb3f411a8/download/public_150k_plus_210630.csv"    
##  [2] "https://data.sba.gov/dataset/8aa276e2-6cab-4f86-aca4-a7dde42adf24/resource/e796a768-4785-46c1-98a7-8bd3a1ef0ab9/download/public_up_to_150k_1_210630.csv" 
##  [3] "https://data.sba.gov/dataset/8aa276e2-6cab-4f86-aca4-a7dde42adf24/resource/962261d5-bd48-469f-8977-4f516e686e45/download/public_up_to_150k_2_210630.csv" 
##  [4] "https://data.sba.gov/dataset/8aa276e2-6cab-4f86-aca4-a7dde42adf24/resource/3335ed9e-6097-46f5-b928-d3ee1ce32520/download/public_up_to_150k_3_210630.csv" 
##  [5] "https://data.sba.gov/dataset/8aa276e2-6cab-4f86-aca4-a7dde42adf24/resource/065378d0-73c5-4730-b1b2-b421c5c44c73/download/public_up_to_150k_4_210630.csv" 
##  [6] "https://data.sba.gov/dataset/8aa276e2-6cab-4f86-aca4-a7dde42adf24/resource/02417f74-713e-4b9b-b831-e3587ad7a2a7/download/public_up_to_150k_5_210630.csv" 
##  [7] "https://data.sba.gov/dataset/8aa276e2-6cab-4f86-aca4-a7dde42adf24/resource/d54eff57-3417-429f-a161-55b497fd2777/download/public_up_to_150k_6_210630.csv" 
##  [8] "https://data.sba.gov/dataset/8aa276e2-6cab-4f86-aca4-a7dde42adf24/resource/94d910a4-54fc-4c54-94b2-ed0bce001366/download/public_up_to_150k_7_210630.csv" 
##  [9] "https://data.sba.gov/dataset/8aa276e2-6cab-4f86-aca4-a7dde42adf24/resource/fc07af7a-6aee-413e-92f9-2abd1145ff25/download/public_up_to_150k_8_210630.csv" 
## [10] "https://data.sba.gov/dataset/8aa276e2-6cab-4f86-aca4-a7dde42adf24/resource/64be68f5-4d4a-409d-91e9-3fb3fc1e603d/download/public_up_to_150k_9_210630.csv" 
## [11] "https://data.sba.gov/dataset/8aa276e2-6cab-4f86-aca4-a7dde42adf24/resource/2efb5391-e77f-467b-9b1c-ea4e9690e94f/download/public_up_to_150k_10_210630.csv"
## [12] "https://data.sba.gov/dataset/8aa276e2-6cab-4f86-aca4-a7dde42adf24/resource/1935d2f9-f1c8-4b1b-99a7-f8cf9cd399bc/download/public_up_to_150k_11_210630.csv"
## [13] "https://data.sba.gov/dataset/8aa276e2-6cab-4f86-aca4-a7dde42adf24/resource/a5a4affd-de13-44ff-b0ec-2b2b30902ae3/download/public_up_to_150k_12_210630.csv"
LS0tCnRpdGxlOiAiUmVnZXggYW5kIHNjcmFwaW5nIGV4YW1wbGVzIGluIFIiCmF1dGhvcjogIlNhcmFoIENvaGVuIgpkYXRlOiAiMTAvMjQvMjAyMSIKb3V0cHV0OiAKICAgaHRtbF9kb2N1bWVudDoKICAgICAgdGhlbWU6IGpvdXJuYWwKICAgICAgZGZfcHJpbnQ6IGthYmxlCiAgICAgIHRvYzogdHJ1ZQogICAgICB0b2NfZmxvYXQ6IHRydWUKICAgICAgY29kZV9kb3dubG9hZDogdHJ1ZQogICAgICBjb2RlX2ZvbGRpbmc6IHNob3cKLS0tCgoKYGBge3Igc2V0dXAsIGluY2x1ZGU9VFJVRSwgbWVzc2FnZT1GQUxTRSwgZWNobz1UUlVFfQoKa25pdHI6Om9wdHNfY2h1bmskc2V0KGVjaG8gPSBUUlVFLAogICAgICAgICAgICAgICAgICAgICAgbWVzc2FnZT1GQUxTRSwgCiAgICAgICAgICAgICAgICAgICAgICB3YXJuaW5nPUZBTFNFKQoKbGlicmFyeSh0aWR5dmVyc2UpICAjIHRoZSB1c3VhbApsaWJyYXJ5KHJ2ZXN0KSAgICAgICMgZm9yIHNjcmFwaW5nCmxpYnJhcnkoamFuaXRvcikgICAgIyB0aGUgdXN1YWwKCgoKYGBgCgoKIyMgVXNlIHJlZ2V4IHRvIGZpbmQgcGhyYXNlcyBhbmQgcGF0dGVybnMgZm9yIGNvZGluZwoKClNvbWUgb2YgeW91IGhhdmUgaGFkIGNhc2VzIHdoZXJlIHRoZXJlIHdlcmUgcGhyYXNlcyB5b3Ugd2FudGVkIHRvIGZpbmQuIFlvdSBjYW4gdXNlIHJlZ2V4IGFuZCBzdHJfZGV0ZWN0KCkgdG8gZmluZCB0aGVtICwgd2hpY2ggcmV0dXJucyBhIHllcy1ubyBhbnN3ZXIuIElmIGl0IGZpbmRzIHRoZSBwYXR0ZXJuLCB0aGVuIHlvdSBjYW4gZG8gc29tZXRoaW5nLiBJZiBpdCBkb2Vzbid0LCB5b3UgY2FuIGRvIHNvbWV0aGluZyBlbHNlLiAKCkhlcmUncyBhbiBleGFtcGxlIHVzaW5nIGEgdGlueSBzbGljZSBvZiB2aXNhIGFwcGxpY2F0aW9ucyBmcm9tIHRoZSBMYWJvciBEZXBhcnRtZW50LiAoSXQncyBhICJ0YWItc2VwYXJhdGVkIiByYXRoZXIgdGhhbiAiY29tbWEtc2VwYXJhdGVkIiBsaW5lLCBzbyB3ZSBoYXZlIHRvIHVzZSByZWFkX3RzdiBpbnN0ZWFkIG9mIHJlYWRfY3N2LiApLiAKCkhlcmUgYXJlIGEgY291cGxlIG9mIHRoaW5ncyB5b3UgbWlnaHQgbm90IGtub3cgYWxyZWFkeTogCgoxLiBUaGUgY29sX3R5cGVzID0gY29scyguLi4pIG1lYW5zIHRoYXQgeW91IGNhbiBmb3JjZSBhbnkgY29sdW1ucyB0byBiZSBhIHNwZWNpZmljIHR5cGUuIEluIHRoaXMgY2FzZSwgd2Ugd2FudCB6aXBjb2RlIHRvIGJlICJjaGFyYWN0ZXIiLCBvciAiYyIgdHlwZS4gRXZlcnl0aGluZyBlbHNlIGl0IHdpbGwgZ3Vlc3MuIAoKMi4gV2UgZG9uJ3Qgd2FudCB0byBkZWFsIHdpdGggdXBwZXIgYW5kIGxvd2VyIGNhc2VkIGVudHJpZXMsIHNvIHRoaXMgdHVybnMgZXZlcnl0aG5nIHRoYXQgaXMgYSBjaGFyYWN0ZXIgdmFyaWFibGUgaW50byBpdHMgdXBwZXIgY2FzZSByZXByZXNlbnRhdGlvbi4gCgpgYGB7ciBnZXRkYXRhfQoKbXlkYXRhIDwtIHJlYWRfdHN2KCJodHRwczovL2Nyb25raXRlZGF0YS5naXRodWIuaW8vY3JvbmtpdGUtZG9jcy9hc3NldHMvZGF0YS9zcGVjaWFsL3JlZ2V4X2gyYnZpc2FzLnR4dCIsIAogICAgICAgICAgICAgICAgICAgY29sX3R5cGVzPSBjb2xzKHppcGNvZGU9ImMiKSkgJT4lCiAgbXV0YXRlIChhY3Jvc3MgKCBpcy5jaGFyYWN0ZXIsIHRvdXBwZXIpKQoKCmBgYAoKTGV0J3MgdGFrZSBhIGxvb2sgYXQgd2hhdCBqb2IgdHlwZXMgYXJlIGJlaW5nIHJlcXVlc3RlZAoKYGBge3IgcHJpbnRkYXRhLCByb3dzLnByaW50PTUwfQoKbXlkYXRhICU+JSBjb3VudChqb2JfdGl0bGUpIAogIAoKYGBgCgoKVGhlcmUgYXJlIGFsbCBraW5kcyBvZiB3YXlzIG9mIHNheWluZyAiRkFSTVdPUktFUiIgaW4gdGhpcyAuIEhlcmUncyBhIHNpbXBsZSB3YXkgdG8gZGVhbCB3aXRoIGl0IC0gdG8gZ2V0IGEgbGlzdCBvZiBhbGwgZmFybSBsYWJvciwgZmllbGQgd29ya2VycyBhbmQgZmFybSB3b3JrZXJzIHZzLiBvdGhlciBqb2JzLiA6CgpgYGB7ciByZWdleCwgcm93cy5wcmludD01MH0KCm15ZGF0YSAlPiUKICBtdXRhdGUgKCBuZXdfdGl0bGUgPSAKICAgICAgICAgICAgIGlmX2Vsc2UgKHN0cl9kZXRlY3QgKCBqb2JfdGl0bGUsICAiKEZJRUxEfEZBUk0pLj8oV09SS3xMQUJPUikiICkgLCAKICAgICAgICAgICAgICAgICAgICAgICJGQVJNV09SS0VSIiwgIk9USEVSIikpICU+JQogIGNvdW50ICggbmV3X3RpdGxlLCBqb2JfdGl0bGUpCgoKCmBgYAoKCgoKIyMgRG93bmxvYWQgYWxsIENTViBmaWxlcyBmcm9tIGEgd2ViIHBhZ2UKCgpUaGUgZmlyc3Qgc3RlcCBpcyB0byByZWFkIHRoZSBIVE1MIHBhZ2UsIGFuZCwgdXNpbmcgWFBBVEgsIGV4dHJhY3Qgb3V0IHRoZSAiYSIgdGFncycgImhyZWYiIGF0dHJpYnV0ZXMuIFRoZW4gd2UgbG9vayBmb3IgdGhlIHBhdHRlcm4sIGAuKmNzdiRgIC4gIAoKCmBgYHtyIGVjaG89VFJVRX0KCmJhc2VfcGFnZSA8LSAiaHR0cHM6Ly9kYXRhLnNiYS5nb3YvZGF0YXNldC9wcHAtZm9pYSIKCgoKcGF0dGVybiA8LSAiLipjc3YkIgoKI2dldCB0aGUgYmFzZSBwYWdlIHJlYWQgaW4gcnZlc3QKbXlfcGFnZSA8LSByZWFkX2h0bWwoYmFzZV9wYWdlKQoKI2V4dHJhY3QgZnJvbSB0aGVtIHRoZSBhdHRyaWJ1dGUgJ2h0bWwnIGZyb20gdGhlIGVsZW1lbnQgJ2EnCmxpbmtzIDwtIGh0bWxfbm9kZXMobXlfcGFnZSwgImEiKSAlPiUgCiAgaHRtbF9hdHRyKCJocmVmIikgJT4lCiAgc3RyX3N1YnNldCggLiwgcGF0dGVybikgCgpsaW5rcwoKYGBgCgoK