library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)
# Raw CSV of the college-majors list in FiveThirtyEight's GitHub data repo.
theLink <- "https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/majors-list.csv"

# Read the file into a data frame. read.csv() already assumes a header row
# and a comma separator, so neither needs to be spelled out.
df_majors <- read.csv(theLink)

# Show the first six rows as a quick sanity check.
head(df_majors, n = 6L)
##   FOD1P                                 Major
## 1  1100                   GENERAL AGRICULTURE
## 2  1101 AGRICULTURE PRODUCTION AND MANAGEMENT
## 3  1102                AGRICULTURAL ECONOMICS
## 4  1103                       ANIMAL SCIENCES
## 5  1104                          FOOD SCIENCE
## 6  1105            PLANT SCIENCE AND AGRONOMY
##                    Major_Category
## 1 Agriculture & Natural Resources
## 2 Agriculture & Natural Resources
## 3 Agriculture & Natural Resources
## 4 Agriculture & Natural Resources
## 5 Agriculture & Natural Resources
## 6 Agriculture & Natural Resources
# Keep the majors whose name contains "DATA" or "STATISTICS". The pattern is
# an ordinary (case-sensitive) regular expression, so `|` means "either".
filter(df_majors, str_detect(Major, "DATA|STATISTICS"))
##   FOD1P                                         Major
## 1  6212 MANAGEMENT INFORMATION SYSTEMS AND STATISTICS
## 2  2101      COMPUTER PROGRAMMING AND DATA PROCESSING
## 3  3702               STATISTICS AND DECISION SCIENCE
##            Major_Category
## 1                Business
## 2 Computers & Mathematics
## 3 Computers & Mathematics

Problem 2

# Text of a printed character vector, console index prefixes included.
# It is assembled one printed row at a time; the trailing blanks on the
# second and third rows are part of the data and must be preserved.
given_str <- paste(
  c(
    '[1] "bell pepper"  "bilberry"     "blackberry"   "blood orange"',
    '[5] "blueberry"    "cantaloupe"   "chili pepper" "cloudberry"  ',
    '[9] "elderberry"   "lime"         "lychee"       "mulberry"    ',
    '[13] "olive"        "salal berry"'
  ),
  collapse = "\n"
)

# Auto-print the string; the console shows it with escaped quotes and \n.
given_str
## [1] "[1] \"bell pepper\"  \"bilberry\"     \"blackberry\"   \"blood orange\"\n[5] \"blueberry\"    \"cantaloupe\"   \"chili pepper\" \"cloudberry\"  \n[9] \"elderberry\"   \"lime\"         \"lychee\"       \"mulberry\"    \n[13] \"olive\"        \"salal berry\""
# Pull the fruit names out of the printed text. The first alternative matches
# a two-word name (lowercase letters, one whitespace character, lowercase
# letters); the second falls back to a single run of lowercase letters.
# Digits, brackets and quotes are never matched, so the "[1]"-style index
# prefixes and the quote marks are left behind.
extracted_str <- given_str %>%
  str_extract_all('[a-z]+\\s[a-z]+|[a-z]+')

# str_extract_all() returns a list with one character vector per input string.
extracted_str
## [[1]]
##  [1] "bell pepper"  "bilberry"     "blackberry"   "blood orange"
##  [5] "blueberry"    "cantaloupe"   "chili pepper" "cloudberry"  
##  [9] "elderberry"   "lime"         "lychee"       "mulberry"    
## [13] "olive"        "salal berry"
# Turn the extracted names into the text of an R `c(...)` call.
# str_extract_all() returns a list, so take its first element (the character
# vector of names), wrap each name in double quotes, and join with ", ".
# The quotes are kept on purpose: without them the result would not be a
# valid R expression.
fruit_vector_code <- paste0(
  "c(",
  paste0('"', extracted_str[[1]], '"', collapse = ", "),
  ")"
)

# writeLines() shows the string as-is, without print()'s escaped quotes.
writeLines(fruit_vector_code)
## c("bell pepper", "bilberry", "blackberry", "blood orange", "blueberry", "cantaloupe", "chili pepper", "cloudberry", "elderberry", "lime", "lychee", "mulberry", "olive", "salal berry")

The two exercises below are taken from *R for Data Science*, section 14.3.5.1 of the online version:

#3 Describe, in words, what these expressions will match:

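(.)\1\1 — the same character three times in a row, e.g. "aaa".

(.)(.)\2\1 — two characters followed by the same two characters in reverse order, e.g. "abba".

(..)\1 — any two characters immediately repeated, e.g. "abab".

(.).\1.\1 — a character, then any character, then the first character again, then any character, then the first character once more, e.g. "abaca".

(.)(.)(.).*\3\2\1 — three characters, then zero or more characters of any kind, then the same three characters in reverse order, e.g. "abccba" or "abc123cba".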

#4 Construct regular expressions to match words that:

Start and end with the same character.

# Sample words; only "SAS" and "MOM" start and end with the same character.
str.4.1 <- c("ZOO", "YAML", "SAS", "TOY", "MOM")

# ^(.)      capture the first character
# (.*\\1)?  optionally: anything, then that same character again
# $         end of the string
# Making the tail optional lets a one-character word (e.g. "A") match too:
# its single character is both its first and its last.
pattern.4.1 <- "^(.)(.*\\1)?$"

# Keep only the words that match the pattern.
str.4.1 %>%
  str_subset(pattern.4.1)
## [1] "SAS" "MOM"

Contain a repeated pair of letters (e.g. “church” contains “ch” repeated twice.)

# Sample words; "church" repeats "ch" and "papa" repeats "pa".
str.4.2 <- c("church", "papa", "apple")

# (..)  capture two consecutive characters as ONE group
# .*    allow anything in between
# \\1   require that same two-character pair again
# Capturing the pair as a single group matters: with two separate groups,
# \\1 would refer to the first character only, and a word such as "abca"
# (no repeated pair) would match by mistake.
pattern.4.2 <- "(..).*\\1"

# Keep only the words that match the pattern.
str.4.2 %>%
  str_subset(pattern.4.2)
## [1] "church" "papa"

Contain one letter repeated in at least three places (e.g. “eleven” contains three “e”s.)

# Sample words; "eleven" has three "e"s, "kjkjkjkj" has four "k"s and "j"s.
str.4.3 <- c("eleven", "kjkjkjkj", "apple")

# (.)   capture a character
# .*    any number of characters (including none)
# \\1   the captured character a second time
# .*    any number of characters again
# \\1   the captured character a third time
# Using `.*` rather than a single `.` between occurrences means the three
# copies may be any distance apart (e.g. "elevate"), not just two positions.
pattern.4.3 <- "(.).*\\1.*\\1"

# Keep only the words that match the pattern.
str.4.3 %>%
  str_subset(pattern.4.3)
## [1] "eleven"   "kjkjkjkj"