library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ purrr   1.0.1 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.5.0 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

Exercise 1

Using the 173 majors listed in fivethirtyeight.com’s College Majors dataset [https://fivethirtyeight.com/features/the-economic-guide-to-picking-a-college-major/], provide code that identifies the majors that contain either “DATA” or “STATISTICS”

Solution

# Download the FiveThirtyEight list of college majors and preview its first rows.
data <- read.csv(
    file = "https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/majors-list.csv"
)
head(data, n = 6L)
##   FOD1P                                 Major                  Major_Category
## 1  1100                   GENERAL AGRICULTURE Agriculture & Natural Resources
## 2  1101 AGRICULTURE PRODUCTION AND MANAGEMENT Agriculture & Natural Resources
## 3  1102                AGRICULTURAL ECONOMICS Agriculture & Natural Resources
## 4  1103                       ANIMAL SCIENCES Agriculture & Natural Resources
## 5  1104                          FOOD SCIENCE Agriculture & Natural Resources
## 6  1105            PLANT SCIENCE AND AGRONOMY Agriculture & Natural Resources
# Keep only the rows whose Major name mentions either of the two key words.
subset_data <- filter(
    data,
    grepl("DATA", Major) | grepl("STATISTICS", Major)
)
head(subset_data, n = 6L)
##   FOD1P                                         Major          Major_Category
## 1  6212 MANAGEMENT INFORMATION SYSTEMS AND STATISTICS                Business
## 2  2101      COMPUTER PROGRAMMING AND DATA PROCESSING Computers & Mathematics
## 3  3702               STATISTICS AND DECISION SCIENCE Computers & Mathematics

Exercise 2

Write code that transforms the data below: [1] “bell pepper” “bilberry” “blackberry” “blood orange” [5] “blueberry” “cantaloupe” “chili pepper” “cloudberry”
[9] “elderberry” “lime” “lychee” “mulberry”
[13] “olive” “salal berry” Into a format like this: c(“bell pepper”, “bilberry”, “blackberry”, “blood orange”, “blueberry”, “cantaloupe”, “chili pepper”, “cloudberry”, “elderberry”, “lime”, “lychee”, “mulberry”, “olive”, “salal berry”)

Solution

# Printed form of the fruit vector (index markers such as [1] plus double-quoted
# items), kept as one single-quoted string so it can be parsed below.
strStart <- '[1] "bell pepper" "bilberry"   "blackberry"  "blood orange"[5] "blueberry"  "cantaloupe"  "chili pepper" "cloudberry"[9] "elderberry"  "lime"     "lychee" "mulberry"[13] "olive"    "salal berry"'
# Turn the printed representation of a character vector (index markers
# followed by double-quoted items) into the text of an R c() call.
# `string` is the printed text; the result is a single string of the form
# c('item 1', 'item 2', ...).
create_fruits_vector <- function(string) {
    # Each run of non-quote characters enclosed in double quotes is one item
    # (regex from textbook 14.3.2). str_extract_all() returns a list, which
    # unlist() collapses to a plain character vector.
    quoted_items <- unlist(str_extract_all(string, "\"[^\"]+\""))
    # Strip the surrounding double quotes from every item.
    bare_items <- str_remove_all(quoted_items, "\"")
    # str_flatten() reduces the character vector to a single string, with the
    # items separated by "', '"; str_c() then wraps it in c('...').
    joined_items <- str_flatten(bare_items, collapse = "', '")
    str_c("c('", joined_items, "')")
}
# Apply create_fruits_vector() to the printed fruit text; the returned string
# is printed below.
create_fruits_vector(strStart)
## [1] "c('bell pepper', 'bilberry', 'blackberry', 'blood orange', 'blueberry', 'cantaloupe', 'chili pepper', 'cloudberry', 'elderberry', 'lime', 'lychee', 'mulberry', 'olive', 'salal berry')"

Exercise 3

Describe, in words, what these expressions will match:

'(.)\1\1'
## [1] "(.)\001\001"

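‘(.)’ captures any single character in a capturing group, and ‘\1’ is a back-reference to that captured character. Written with escaped backslashes ("(.)\\1\\1"), this expression matches the same character three times in a row. For example, in the string ‘ballooon’ it would match ‘ooo’. Note that as typed above, with single backslashes, R reads ‘\1’ as the octal escape for the control character \001 (see the printed value), so that literal string would instead match any character followed by two \001 characters.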

"(.)(.)\\2\\1"
## [1] "(.)(.)\\2\\1"

Matches any two characters followed by the same two characters in reversed order. For example, in the string ‘abcddc’ it would match ‘cddc’.

"(..)\1"
## [1] "(..)\001"

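With the backslash escaped ("(..)\\1"), this matches any string that contains a pair of characters that is immediately repeated. For example, in the string ‘coconut’ it would match ‘coco’. As typed above, with a single backslash, R reads ‘\1’ as the control character \001 (see the printed value), so that literal string would instead match any two characters followed by \001.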

"(.).\\1.\\1"
## [1] "(.).\\1.\\1"

Matches any string containing a five-character sequence in which the first, third and fifth characters are the same: a captured character, then any character, then the captured character again, then any character, then the captured character once more. For example, in the string ‘apanads’ it would match ‘apana’.

"(.)(.)(.).*\\3\\2\\1"
## [1] "(.)(.)(.).*\\3\\2\\1"

Matches any string that contains three captured characters, followed by zero or more characters of any kind, followed by the same three characters in reverse order (the third captured character, then the second, then the first). For example, in the string ‘paragraph’ it would match ‘paragrap’: ‘par’, then ‘ag’, then ‘rap’.

Exercise 4

Construct regular expressions to match words that: -Start and end with the same character. -Contain a repeated pair of letters (e.g. “church” contains “ch” repeated twice.) -Contain one letter repeated in at least three places (e.g. “eleven” contains three “e”s.)

Solution

# Start and end with the same character.
# The part after the first character is optional so that one-letter words
# (e.g. "a"), which trivially start and end with the same character, are
# matched as well; longer words must still end with the captured character.
str_subset(words, "^(.)(.*\\1)?$")
##  [1] "a"          "america"    "area"       "dad"        "dead"      
##  [6] "depend"     "educate"    "else"       "encourage"  "engine"    
## [11] "europe"     "evidence"   "example"    "excuse"     "exercise"  
## [16] "expense"    "experience" "eye"        "health"     "high"      
## [21] "knock"      "level"      "local"      "nation"     "non"       
## [26] "rather"     "refer"      "remember"   "serious"    "stairs"    
## [31] "test"       "tonight"    "transport"  "treat"      "trust"     
## [36] "window"     "yesterday"

The ^ anchor pins the capturing group to the first character of the string, and the $ anchor requires the back-reference to that group to be the last character, with any characters allowed in between.

# Words in which some two-character pair occurs again later in the word
# (as "ch" does in "church").
str_subset(string = words, pattern = "(..).*\\1")
##  [1] "appropriate" "church"      "condition"   "decide"      "environment"
##  [6] "london"      "paragraph"   "particular"  "photograph"  "prepare"    
## [11] "pressure"    "remember"    "represent"   "require"     "sense"      
## [16] "therefore"   "understand"  "whether"
# Words in which a single character shows up in at least three places
# (as "e" does in "eleven").
str_subset(string = words, pattern = "(.).*\\1.*\\1")
##  [1] "appropriate" "available"   "believe"     "between"     "business"   
##  [6] "degree"      "difference"  "discuss"     "eleven"      "environment"
## [11] "evidence"    "exercise"    "expense"     "experience"  "individual" 
## [16] "paragraph"   "receive"     "remember"    "represent"   "telephone"  
## [21] "therefore"   "tomorrow"