library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.0.6     v dplyr   1.0.3
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

Question 1

Using the 173 majors listed in fivethirtyeight.com’s College Majors dataset [https://fivethirtyeight.com/features/the-economic-guide-to-picking-a-college-major/], provide code that identifies the majors that contain either “DATA” or “STATISTICS”

# Load the FiveThirtyEight majors list directly from its GitHub URL.
major_data <-
  "https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/majors-list.csv" %>%
  read_csv()
## 
## -- Column specification --------------------------------------------------------
## cols(
##   FOD1P = col_character(),
##   Major = col_character(),
##   Major_Category = col_character()
## )
# Preview the first rows to check the three character columns.
head(x = major_data)
## # A tibble: 6 x 3
##   FOD1P Major                                 Major_Category                 
##   <chr> <chr>                                 <chr>                          
## 1 1100  GENERAL AGRICULTURE                   Agriculture & Natural Resources
## 2 1101  AGRICULTURE PRODUCTION AND MANAGEMENT Agriculture & Natural Resources
## 3 1102  AGRICULTURAL ECONOMICS                Agriculture & Natural Resources
## 4 1103  ANIMAL SCIENCES                       Agriculture & Natural Resources
## 5 1104  FOOD SCIENCE                          Agriculture & Natural Resources
## 6 1105  PLANT SCIENCE AND AGRONOMY            Agriculture & Natural Resources
# Keep only the rows whose Major name contains DATA or STATISTICS.
filtered <- dplyr::filter(
  .data = major_data,
  grepl(pattern = "DATA|STATISTICS", x = Major)
)
# Show the matching majors (head() prints at most six rows).
head(x = filtered, n = 6L)
## # A tibble: 3 x 3
##   FOD1P Major                                         Major_Category         
##   <chr> <chr>                                         <chr>                  
## 1 6212  MANAGEMENT INFORMATION SYSTEMS AND STATISTICS Business               
## 2 2101  COMPUTER PROGRAMMING AND DATA PROCESSING      Computers & Mathematics
## 3 3702  STATISTICS AND DECISION SCIENCE               Computers & Mathematics

Question 2

Write code that transforms the data below:

[1] “bell pepper” “bilberry” “blackberry” “blood orange”

[5] “blueberry” “cantaloupe” “chili pepper” “cloudberry”

[9] “elderberry” “lime” “lychee” “mulberry”

[13] “olive” “salal berry”

Into a format like this:

c(“bell pepper”, “bilberry”, “blackberry”, “blood orange”, “blueberry”, “cantaloupe”, “chili pepper”, “cloudberry”, “elderberry”, “lime”, “lychee”, “mulberry”, “olive”, “salal berry”)

Step 1: Copy in the data

# Store the printed fruit vector as one raw string, exactly as pasted: the
# [n] index markers, the double quotes, the padding spaces and the blank lines
# are all part of the text and are stripped out in the later steps.
fruit_data <- '[1] "bell pepper"  "bilberry"     "blackberry"   "blood orange"

[5] "blueberry"    "cantaloupe"   "chili pepper" "cloudberry"  

[9] "elderberry"   "lime"         "lychee"       "mulberry"    

[13] "olive"        "salal berry"'
# Print the string to inspect the escaped quotes and newlines it contains.
fruit_data
## [1] "[1] \"bell pepper\"  \"bilberry\"     \"blackberry\"   \"blood orange\"\n\n[5] \"blueberry\"    \"cantaloupe\"   \"chili pepper\" \"cloudberry\"  \n\n[9] \"elderberry\"   \"lime\"         \"lychee\"       \"mulberry\"    \n\n[13] \"olive\"        \"salal berry\""

Step 2: Remove undesired characters

# Extract every fruit name: a run of letters, optionally followed by further
# words that are each separated by exactly one space.
# - `[A-Za-z]` covers all ASCII letters (a class written `[Za-z]` would only
#   cover "Z" plus the lowercase letters).
# - The separator is a literal space, so digits, quotes or brackets can never
#   be absorbed into a name the way a `.` wildcard would allow.
# - One-letter names are matched as well.
# str_extract_all() returns a list with one character vector per input string.
fruit_intermediate <- str_extract_all(
  fruit_data,
  pattern = "[A-Za-z]+(?: [A-Za-z]+)*"
)
fruit_intermediate
## [[1]]
##  [1] "bell pepper"  "bilberry"     "blackberry"   "blood orange" "blueberry"   
##  [6] "cantaloupe"   "chili pepper" "cloudberry"   "elderberry"   "lime"        
## [11] "lychee"       "mulberry"     "olive"        "salal berry"

Step 3: Collapse the data

# str_extract_all() returned a list holding a single character vector; take
# that vector out so str_c() receives an atomic vector. Passing the list
# itself makes str_c() coerce it with a warning.
fruit_names <- fruit_intermediate[[1]]
# Build the target text explicitly: wrap each name in double quotes, join the
# names with ", " and surround the result with c( ).
fruit_final <- str_c(
  "c(",
  str_c('"', fruit_names, '"', collapse = ", "),
  ")"
)
fruit_final
## [1] "c(\"bell pepper\", \"bilberry\", \"blackberry\", \"blood orange\", \"blueberry\", \"cantaloupe\", \"chili pepper\", \"cloudberry\", \"elderberry\", \"lime\", \"lychee\", \"mulberry\", \"olive\", \"salal berry\")"

Step 4: Use writeLines() to print the string without the escape backslashes

# Write the raw text to standard output so the quotes appear unescaped.
writeLines(text = fruit_final, con = stdout())
## c("bell pepper", "bilberry", "blackberry", "blood orange", "blueberry", "cantaloupe", "chili pepper", "cloudberry", "elderberry", "lime", "lychee", "mulberry", "olive", "salal berry")

Question 3

Note: quotes were added around the first and third expressions; otherwise they would not work. (.)\1\1 matches any character appearing three times in a row, e.g. the three m’s in “hmmm”. (.)(.)\2\1 matches a pair of characters followed by the same pair in reverse order (a four-character palindrome), e.g. “ahha”. (..)\1 matches a group of two characters that repeats immediately afterwards, e.g. “uhuh”. (.).\1.\1 matches a character that occurs three times with exactly one character (any character, not necessarily the same one each time) between consecutive occurrences, e.g. “ahaha”. (.)(.)(.).*\3\2\1 matches a group of three characters followed later by the same three characters in reverse order; there can be any number of characters between the two groups, e.g. “ahhhha”, “ummnooooooommu”.

# Sample strings, each chosen to be matched by one of the patterns below.
programmer_sounds <- c(
  "hmmm", "ahha", "uhuh", "bwahaha", "ahhhha", "ummnooooooommu"
)
# The same character three times in a row.
str_view(string = programmer_sounds, pattern = "(.)\\1\\1")
# Two characters followed by the same two in reverse order.
str_view(string = programmer_sounds, pattern = "(.)(.)\\2\\1")
# A two-character group repeated immediately.
str_view(string = programmer_sounds, pattern = "(..)\\1")
# One character at positions 1, 3 and 5 of a five-character span.
str_view(string = programmer_sounds, pattern = "(.).\\1.\\1")
# Three characters, anything in between, then the same three reversed.
str_view(string = programmer_sounds, pattern = "(.)(.)(.).*\\3\\2\\1")

Question 4

Construct regular expressions to match words that:

Start and end with the same character. Contain a repeated pair of letters (e.g. “church” contains “ch” repeated twice.) Contain one letter repeated in at least three places (e.g. “eleven” contains three “e”s.)

# Start and end with the same character. Both ends are anchored so the
# backreference has to be the final character of the word; without anchors the
# pattern matches any word that merely contains a repeated character. The
# optional group lets one-character words match as well.
str_view(c("pip", "doomed"), "^(.)(.*\\1)?$")
# Contain a repeated pair of letters. The captured pair is restricted to
# letters so punctuation or digits cannot form the pair.
str_view(c("church", "booboo"), "([A-Za-z][A-Za-z]).*\\1")
# Contain one letter repeated in at least three places, with anything (or
# nothing) between the occurrences.
str_view(c("eleven", "seventeen"), "([A-Za-z]).*\\1.*\\1")