# load packages
library(rvest)
library(htmltab)
library(stringr)
library(dplyr)

1. Using the 173 majors listed in fivethirtyeight.com’s College Majors dataset [https://fivethirtyeight.com/features/the-economic-guide-to-picking-a-college-major/], provide code that identifies the majors that contain either “DATA” or “STATISTICS”.

# address of the page that holds the college-majors table
file <- "https://projects.fivethirtyeight.com/mid-levels/college-majors/index.html"
# download and parse the HTML document
data <- read_html(file)

# parse the tables found in the document into a list
allTables <- html_table(data, fill = TRUE)

# keep the first table in the list
table1 <- allTables[[1]]
# extract the MAJOR column as a plain vector
majors <- pull(table1, MAJOR)
# flag the majors that mention "statistics" or "data", ignoring case
matches <- str_detect(majors, "(?i)statistics|(?i)data")
# show only the flagged majors
majors[matches]
## [1] "Mgmt. Information Systems & Statistics"
## [2] "Statistics & Decision Science"         
## [3] "Computer Programming & Data Processing"

2. Write code that transforms the data below:

[1] “bell pepper” “bilberry” “blackberry” “blood orange”
[5] “blueberry” “cantaloupe” “chili pepper” “cloudberry”
[9] “elderberry” “lime” “lychee” “mulberry”
[13] “olive” “salal berry”

Into a format like this:
c(“bell pepper”, “bilberry”, “blackberry”, “blood orange”, “blueberry”, “cantaloupe”, “chili pepper”, “cloudberry”, “elderberry”, “lime”, “lychee”, “mulberry”, “olive”, “salal berry”)

# Printed console output of a character vector, stored as one string.
text <- '[1] "bell pepper"  "bilberry"     "blackberry"   "blood orange"
[5] "blueberry"    "cantaloupe"   "chili pepper" "cloudberry"  
[9] "elderberry"   "lime"         "lychee"       "mulberry"    
[13] "olive"        "salal berry"'

# Extract every double-quoted token. Anchoring on the quotes, instead of
# guessing at the word shape, keeps items of any length or word count
# ("fig", "red bell pepper") and can never pick up the [n] index markers.
quoted <- str_extract_all(text, '"[^"]*"')
# Strip the surrounding quotes; `words` stays a list with one character
# vector per input string.
words <- lapply(quoted, str_remove_all, pattern = '"')
# simplify list to a character vector
unlist(words)
##  [1] "bell pepper"  "bilberry"     "blackberry"   "blood orange" "blueberry"   
##  [6] "cantaloupe"   "chili pepper" "cloudberry"   "elderberry"   "lime"        
## [11] "lychee"       "mulberry"     "olive"        "salal berry"

3. Describe, in words, what these expressions will match:

4. Construct regular expressions to match words that:

# Words that start and end with the same character.
# `^` pins the captured character to the first position; the optional group
# lets one-character words and two-character words such as "aa" match too.
same <- c("yay", "bcb", "dad", "roger", "maaaaan", "maaaaaaaaaam")
str_view(same, "^(.)(.*\\1)?$", match = TRUE)

# Words that contain a repeated pair of letters.
# `.*` (rather than `.+`) allows the two pairs to be adjacent, as in "papa",
# and the character class restricts the pair to letters.
church <- c("church", "barn", "outhouse", "papa", "eleven")
str_view(church, "([A-Za-z][A-Za-z]).*\\1", match = TRUE)

# Words that contain one letter in at least three places.
# `.*` between the back-references puts no constraint on the spacing of the
# repeats, and nothing is required after the third occurrence ("banana").
str_view(church, "([A-Za-z]).*\\1.*\\1", match = TRUE)