DATA607_Week3

Exercise 1. Matching strings

library(stringr)

# Download the FiveThirtyEight college-majors list. read.csv() already
# defaults to header = TRUE and sep = ",", so only the location is passed.
majorsURL <- "https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/majors-list.csv"
majors <- read.csv(majorsURL)

# Keep the majors whose name contains "STATISTICS".
str_subset(string = majors[["Major"]], pattern = "STATISTICS")

## [1] "MANAGEMENT INFORMATION SYSTEMS AND STATISTICS"
## [2] "STATISTICS AND DECISION SCIENCE"

# Keep the majors whose name contains "DATA".
str_subset(string = majors[["Major"]], pattern = "DATA")

## [1] "COMPUTER PROGRAMMING AND DATA PROCESSING"

Exercise 2. Collapsing strings

# Fruit names, kept in the same four groups as the original listing.
line1 <- c("bell pepper", "bilberry", "blackberry", "blood orange")
line5 <- c("blueberry", "cantaloupe", "chili pepper", "cloudberry")
line9 <- c("elderberry", "lime", "lychee", "mulberry")
line13 <- c("olive", "salal berry")

# Join the groups in order, then collapse everything into a single
# comma-separated string.
str_c(
  c(line1, line5, line9, line13),
  collapse = ", "
)

## [1] "bell pepper, bilberry, blackberry, blood orange, blueberry, cantaloupe, chili pepper, cloudberry, elderberry, lime, lychee, mulberry, olive, salal berry"

Exercise 3. Describe, in words, what these expressions will match:

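###a. Written as the R string "(.)\1\1", this regexp matches any single character followed by two U+0001 control characters: inside an R string literal an unescaped `\1` is an octal escape, not a backreference. (The backreference form, "(.)\\1\\1", would instead match the same character three times in a row, e.g. "777".)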

# Sample strings for pattern (a); several of them embed the "\1" escape.
test <- list(
  "777", "data\1\1", "anna", "2002", '"elle"',
  "abc  d\1\1", "\1\1\1\1\1\1\1\1\1", "a  a  a    "
)
# The pattern is deliberately left unescaped. match = TRUE limits the
# display to the elements in which the pattern is found.
str_view(string = test, pattern = "(.)\1\1", match = TRUE)

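###b. This regexp matches any four-character sequence that reads the same forwards and backwards: two characters followed by the same two characters in reverse order (e.g. "anna", "2002", "daad"). Because it is not anchored, the sequence may also sit inside a longer string.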

# Sample strings for pattern (b).
test <- list(
  "dada", "daad", "ddaaaa", "777", "data\1\1", "anna", "2002", '"elle"',
  "abc  d\1\1", "\1\1\1\1\1\1\1\1\1", "a  a  a    "
)
# Two captured characters followed by the same two in reverse order.
# match = TRUE limits the display to the elements that match.
str_view(string = test, pattern = "(.)(.)\\2\\1", match = TRUE)

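###c. Written as the R string "(..)\1", this regexp matches any two characters followed by one U+0001 control character: inside an R string literal an unescaped `\1` is an octal escape, not a backreference. (The backreference form, "(..)\\1", would instead match a pair of characters that is immediately repeated, e.g. "dada".)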

# Sample strings for pattern (c).
test <- list(
  "dada", "daad", "ddaaaa", "777", "data\1\1", "anna", "2002", '"elle"',
  "abc  d\1\1", "\1\1\1\1\1\1\1\1\1", "a  a  a    "
)
# The pattern is deliberately left unescaped. match = TRUE limits the
# display to the elements in which the pattern is found.
str_view(string = test, pattern = "(..)\1", match = TRUE)

###d. This regexp matches a five-character sequence in which the 1st, 3rd and 5th characters are identical; the 2nd and 4th characters can be anything (e.g. "vvvvv" or "abaca").

# Sample strings for pattern (d).
test <- list(
  "dada", "daad", "ddaaaa", "777", "data\1\1", "anna", "2002", '"elle"',
  "abc  d\1\1", "\1\1\1\1\1\1\1\1\1",
  "a  a  a  vvvvv  ", "a    a    a       "
)
# One captured character that must reappear in the 3rd and 5th positions.
# match = TRUE limits the display to the elements that match.
str_view(string = test, pattern = "(.).\\1.\\1", match = TRUE)

###e. This regexp matches any three characters, followed by zero or more arbitrary characters, followed by the same three characters in reverse order (e.g. "abccba" or "abc-xyz-cba").

# Sample strings for pattern (e).
test <- list(
  "dada", "daad", "ddaaaa", "777", "data\1\1", "anna", "2002", '"elle"',
  "abc  d\1\1", "\1\1\1\1\1\1\1\1\1",
  "a  a  a  vvvvv  ", "a    a    a       "
)
# Three captured characters, any filler, then the same three reversed.
# match = TRUE limits the display to the elements that match.
str_view(string = test, pattern = "(.)(.)(.).*\\3\\2\\1", match = TRUE)

Exercise 4. Construct regular expressions to match words that:

a. Start and end with the same character.

# Sample words; "a" is included so the one-character case is exercised.
a <- list("dad", "mom", "pop", "lap", "diamond", "greg", "tim", "a")
# Pattern, piece by piece:
#   ^(.)      capture the first character of the word
#   (.*\\1)?  optionally: anything, then that same character again
#   $         end of the word
# The second group is optional because a one-character word also starts and
# ends with the same character; a mandatory second occurrence would reject
# it by requiring at least two characters.
# match = TRUE limits the display to the elements that match.
str_view(a, "^(.)(.*\\1)?$", match = TRUE)

b. Contain a repeated pair of letters (e.g. “church” contains “ch” repeated twice.)

# Sample words for the "repeated pair" pattern.
b <- list(
  "church", "shush", "papa", "cookbook",
  "diamond", "greg", "tim"
)
# Capture any two adjacent characters and require the same pair to occur
# again later in the word. match = TRUE limits the display to the matches.
str_view(string = b, pattern = "(..).*\\1", match = TRUE)

c. Contain one letter repeated in at least three places (e.g. “eleven” contains three “e”s.)

# Sample words for the "same letter in three places" pattern.
# NOTE(review): this variable shares its name with base::c(); calls such as
# c(...) still resolve to the function, but a different name would be clearer.
c <- list(
  "dad", "mom", "pop", "lap", "church", "shush", "papa", "cookbook",
  "diamond", "greg", "tim", "eleven", "elephant", "Mississippi"
)
# Capture one lowercase letter and require it twice more, with anything in
# between. The class [a-z] is lowercase-only, so an uppercase letter is never
# the captured one. match = TRUE limits the display to the matches.
str_view(string = c, pattern = "([a-z]).*\\1.*\\1", match = TRUE)