# load packages
library(rvest)
library(htmltab)
library(stringr)
library(dplyr)
–
# Scrape the FiveThirtyEight college-majors page and list every major whose
# name mentions "statistics" or "data" (case-insensitive).

# store file URL
file <- "https://projects.fivethirtyeight.com/mid-levels/college-majors/index.html"
# download and parse the page
data <- read_html(file)
# read every <table> on the page into a list of data frames
# NOTE(review): `fill` is deprecated in rvest >= 1.0.0 (missing cells are
# always filled there); it is kept so the script still runs on older rvest
# versions -- confirm the installed version before dropping it.
allTables <- data %>%
  html_table(fill = TRUE)
# fail with a clear message instead of "subscript out of bounds"
if (length(allTables) == 0) {
  stop("no <table> elements found at ", file, call. = FALSE)
}
# select the first table
table1 <- allTables[[1]]
# create a vector of the majors; pull() extracts one column as a vector
# (collect() is only meaningful for remote tables and was a no-op here)
majors <- pull(table1, MAJOR)
# detect matches, ignoring case for both alternatives
matches <- str_detect(majors, regex("statistics|data", ignore_case = TRUE))
# subset majors
majors[matches]
## [1] "Mgmt. Information Systems & Statistics"
## [2] "Statistics & Decision Science"
## [3] "Computer Programming & Data Processing"
–
Task: convert the printed output of a character vector, such as this:
[1] "bell pepper" "bilberry" "blackberry" "blood orange"
[5] "blueberry" "cantaloupe" "chili pepper" "cloudberry"
[9] "elderberry" "lime" "lychee" "mulberry"
[13] "olive" "salal berry"
into a format like this:
c("bell pepper", "bilberry", "blackberry", "blood orange", "blueberry",
"cantaloupe", "chili pepper", "cloudberry", "elderberry", "lime",
"lychee", "mulberry", "olive", "salal berry")
# Recover the elements of a character vector from its console printout
# (index markers such as [1] followed by double-quoted elements).
text <- '[1] "bell pepper" "bilberry" "blackberry" "blood orange"
[5] "blueberry" "cantaloupe" "chili pepper" "cloudberry"
[9] "elderberry" "lime" "lychee" "mulberry"
[13] "olive" "salal berry"'
# stringr extract_all: grab every double-quoted token. Anchoring on the quotes
# (rather than on word shapes) keeps names of any length or word count intact,
# e.g. "fig" or a three-word name, and never picks up the [n] index markers.
quoted <- str_extract_all(text, "\"[^\"]*\"")
# strip the surrounding quotes; `words` stays a list with one element per
# input string, as returned by str_extract_all()
words <- lapply(quoted, str_remove_all, pattern = "\"")
# simplify list to a character vector
unlist(words)
## [1] "bell pepper" "bilberry" "blackberry" "blood orange" "blueberry"
## [6] "cantaloupe" "chili pepper" "cloudberry" "elderberry" "lime"
## [11] "lychee" "mulberry" "olive" "salal berry"
–
(.)\1\1
the same character appearing three times in a row (e.g. "aaa")
"(.)(.)\\2\\1"
any two characters followed by those same two characters in
reverse order (e.g. "abba")
(..)\1
any two characters immediately repeated once (e.g. "abab")
"(.).\\1.\\1"
any character, followed by any character, followed by the first character,
followed by any character, followed by the first character again (e.g. "abaca")
"(.)(.)(.).*\\3\\2\\1"
any three characters, followed by zero or more characters of anything,
followed by the same three characters in reverse order (e.g. "abcxyzcba")
–
# Backreference exercises previewed with str_view(); match = TRUE limits the
# display to the strings that match the pattern.
# NOTE(review): the intent of each pattern is inferred from the test vectors
# ("same", "church", "papa", "eleven") -- confirm against the exercise text.

same <- c("yay", "bcb", "dad", "roger", "maaaaan", "maaaaaaaaaam")
# Strings that start and end with the same character. `^` pins the captured
# character to the start (without it, any earlier occurrence of the final
# character would do) and `.*` also allows two-character strings such as "aa".
str_view(same, "^(.).*\\1$", match = TRUE)

church <- c("church", "barn", "outhouse", "papa", "eleven")
# Strings containing a repeated pair of characters. `.*` (not `.+`) lets the
# second pair follow the first immediately, so "papa" matches too.
str_view(church, "(..).*\\1", match = TRUE)
# Strings containing one letter in at least three places, with any spacing
# between the occurrences and nothing required after the last one.
str_view(church, "([a-zA-Z]).*\\1.*\\1", match = TRUE)