CUNY 607

Problem 3

# Raw text in which the names are interleaved with phone numbers.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5543642Dr. Julius Hibbert"
# A name is any run of two or more letters, periods, commas or spaces.
name_pattern <- "[[:alpha:]., ]{2,}"
# raw.data is a single string, so the only list element holds every match.
name <- str_extract_all(raw.data, name_pattern)[[1]]
print(name)

## [1] "Moe Szyslak"          "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders"         "Simpson, Homer"       "Dr. Julius Hibbert"

### (a)  [Note: The question is slightly ambiguous. I assume they want no titles, and that e.g., "C." is considered a first name.]
# Title patterns (regular-expression fragments; literal dots are escaped).
titles <- c("Dr\\.", "Rev\\.", "Mr\\.", "Mrs\\.", "Miss", "Ms\\.")

# Put "Last, First" names into "First Last" order; names without a comma are
# kept as they are.
name_first_last <- vapply(
  strsplit(name, ", ", fixed = TRUE),
  function(parts) {
    if (length(parts) > 1) paste(parts[2], parts[1]) else parts[1]
  },
  character(1)
)

# Remove all titles in one pass. Every title is anchored at a word boundary so
# that only whole titles are removed (e.g. "Miss" is not stripped from
# "Missy"). Titles ending in a dot need no trailing anchor: the dot already
# terminates them.
ends_in_word_char <- grepl("[[:alnum:]]$", titles)
title_pattern <- paste0(
  "\\b", titles, ifelse(ends_in_word_char, "\\b", ""),
  collapse = "|"
)
name_first_last <- trimws(
  gsub(title_pattern, "", name_first_last, perl = TRUE)
)

# Keep only the first and last word of each name, dropping any middle names.
# A single-word name is returned unchanged instead of being duplicated.
name_first_last <- vapply(
  strsplit(name_first_last, " ", fixed = TRUE),
  function(words) {
    words <- words[nzchar(words)]
    if (length(words) < 2) {
      return(paste(words, collapse = ""))
    }
    paste(words[1], words[length(words)])
  },
  character(1)
)
name_first_last

## [1] "Moe Szyslak"     "C. Burns"        "Timothy Lovejoy" "Ned Flanders"   
## [5] "Homer Simpson"   "Julius Hibbert"

### (b)
# Flag each name that contains one of the titles. All title patterns are
# combined into a single alternation, each anchored at a word boundary so
# that, for example, "Miss" is not detected inside "Missy". A single grepl()
# call returns one logical per name and also works when `name` holds only one
# element (rowSums() over sapply() fails there because no matrix is built).
title_regex <- paste0(
  "\\b", titles, ifelse(grepl("[[:alnum:]]$", titles), "\\b", ""),
  collapse = "|"
)
title_bool <- grepl(title_regex, name, perl = TRUE)
title_bool

## [1] FALSE FALSE  TRUE FALSE FALSE  TRUE

### (c)  [Note: I assume they mean "second name, excluding title"]
# A name has a second name when it contains at least two spaces (three or
# more words) and carries no title. str_count() counts the spaces directly
# and yields a logical vector even when `name` is empty.
two_spaces <- str_count(name, " ") >= 2
two_spaces & !title_bool

## [1] FALSE  TRUE FALSE FALSE FALSE FALSE

Problem 4

### (a) This regex matches one or more consecutive digits immediately followed by a literal dollar sign
# Digits directly followed by a literal "$": "199$" matches, "$199" does not.
str_extract_all(
  string = "It is odd to want 199$ and not $199.",
  pattern = "[0-9]+\\$"
)

## [[1]]
## [1] "199$"

### (b) a run of 1 to 4 lowercase letters that both starts and ends at a word boundary, i.e. a whole lowercase word of at most four letters
# Whole words of 1 to 4 lowercase letters: only "cost" qualifies here, since
# every other word contains an uppercase letter or is longer than four letters.
str_extract_all(
  string = "Cost produces NO match, BUT cost WILL.",
  pattern = "\\b[a-z]{1,4}\\b"
)

## [[1]]
## [1] "cost"

### (c) shortest possible sequence of any characters before and including the literal ".txt"
# Lazy match: as few characters as possible up to and including ".txt".
str_extract_all(
  string = "file.txt and then all of these words plus file2.txt ",
  pattern = ".*?\\.txt"
)

## [[1]]
## [1] "file.txt"                                   
## [2] " and then all of these words plus file2.txt"

### (d) two digits, a slash, two more digits, a slash, and four more digits (such as the date 12/21/1976)
# Exactly 2 digits / 2 digits / 4 digits: "1/3/98" has too few digits.
str_extract_all(
  string = "1/3/98 will not get matched but 12/21/1976 will",
  pattern = "\\d{2}/\\d{2}/\\d{4}"
)

## [[1]]
## [1] "12/21/1976"

### (e) matches an opening tag, the shortest possible text after it, and the matching closing tag, using the backreference \1 to repeat the captured tag name
# The group captures the tag name; \\1 requires the closing tag to repeat it.
str_extract_all(
  string = "This text won't match but <body> This whole element will. </body>",
  pattern = "<(.+?)>.+?</\\1>"
)

## [[1]]
## [1] "<body> This whole element will. </body>"

Problem 9

# Pull every uppercase ASCII letter out of the scrambled text, in order.
# The string literal spans several lines, so it contains the line breaks.
str_extract_all(
  string = "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0Tanwo
Uwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigO
d6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5
fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr",
  pattern = "[A-Z]"
)

## [[1]]
##  [1] "C" "O" "N" "G" "R" "A" "T" "U" "L" "A" "T" "I" "O" "N" "S" "Y" "O"
## [18] "U" "A" "R" "E" "A" "S" "U" "P" "E" "R" "N" "E" "R" "D"

CUNY 607

Week 3 HW Assignment

mehtablocker

February 13, 2019

Problem 3

Problem 4

Problem 9