DATA 607 Week 3 Homework

3. Copy the introductory example. The vector `name` stores the extracted names.

R> name
[1] "Moe Szyslak"          "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
[4] "Ned Flanders"         "Simpson, Homer"       "Dr. Julius Hibbert"

(a) Use the tools of this chapter to rearrange the vector so that all elements conform to the standard first_name last_name.

# Raw phone-book text in which names and phone numbers run together.
rawData <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"

# A name is any run of at least two letters, periods, commas or spaces.
# str_extract_all() returns a list with one element per input string; rawData
# is a single string, so the first element holds every match.
names <- str_extract_all(
  string = rawData,
  pattern = "[[:alpha:]., ]{2,}"
)[[1]]
names

## [1] "Moe Szyslak"          "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders"         "Simpson, Homer"       "Dr. Julius Hibbert"

# (a) Bring every entry into the form "first_name last_name" in two steps:
#   1. Remove honorific titles such as "Rev." and "Dr." -- a word of two or
#      more characters followed by a period -- together with the whitespace
#      after them, so no leading blank is left behind. A one-letter initial
#      such as "C." is too short to match and is therefore kept.
#   2. Flip entries written as "last_name, first_name" around the comma.
titlePattern <- "\\b\\w{2,}?\\."
noTitle <- str_replace_all(names, paste0(titlePattern, "\\s*"), "")
firstLastName <- str_replace(noTitle, "^(.+?),\\s*(.+)$", "\\2 \\1")
firstLastName

## [1] "Moe Szyslak"         "C. Montgomery Burns" "Timothy Lovejoy"
## [4] "Ned Flanders"        "Homer Simpson"       "Julius Hibbert"

(b) Construct a logical vector indicating whether a character has a title (i.e., Rev. and Dr.).

# A title is detected with the same pattern that step (a) uses to remove it.
data.frame(names = names, hasTitle = str_detect(names, titlePattern))

##                  names hasTitle
## 1          Moe Szyslak    FALSE
## 2 Burns, C. Montgomery    FALSE
## 3 Rev. Timothy Lovejoy     TRUE
## 4         Ned Flanders    FALSE
## 5       Simpson, Homer    FALSE
## 6   Dr. Julius Hibbert     TRUE

(c) Construct a logical vector indicating whether a character has a second name.

# Once titles are gone and the order is normalised, a name made of more than
# two whitespace-separated parts carries a second name (here only the "C." of
# "C. Montgomery Burns"). Counting parts on the cleaned names keeps the titles
# "Rev." and "Dr." from being mistaken for a second name.
data.frame(names = firstLastName, hasSecondName = str_count(firstLastName, "\\S+") > 2)

##                 names hasSecondName
## 1         Moe Szyslak         FALSE
## 2 C. Montgomery Burns          TRUE
## 3     Timothy Lovejoy         FALSE
## 4        Ned Flanders         FALSE
## 5       Homer Simpson         FALSE
## 6      Julius Hibbert         FALSE

4. Describe the types of strings that conform to the following regular expressions and construct an example that is matched by the regular expression.

[0-9]+\$

# Matches one or more digits immediately followed by a literal dollar sign,
# anywhere in the string (for example "5000$"). A dollar sign placed before
# the digits, as in "$100", does not qualify.
matcher <- "[0-9]+\\$"
values <- c("The shoe worths $100", "ab988uy$", "1234", "5000$")

data.frame(
  value = values,
  matched = str_detect(string = values, pattern = matcher)
)

##                  value matched
## 1 The shoe worths $100   FALSE
## 2             ab988uy$   FALSE
## 3                 1234   FALSE
## 4                5000$    TRUE

\b[a-z]{1,4}\b

# Matches a stand-alone word made of one to four lower-case letters: the word
# boundaries on both sides rule out longer words and words that contain an
# upper-case letter or a digit (for example "shoe" or "name").
matcher <- "\\b[a-z]{1,4}\\b"
values <- c(
  "The shoe worths $100",
  "My Name Is Subzero",
  "My name Is subzero",
  "5000$"
)

data.frame(
  value = values,
  matched = str_detect(string = values, pattern = matcher)
)

##                  value matched
## 1 The shoe worths $100    TRUE
## 2   My Name Is Subzero   FALSE
## 3   My name Is subzero    TRUE
## 4                5000$   FALSE

.*?\.txt$

# Matches any string that ends in the literal suffix ".txt", whatever comes
# before it -- typically the name of a text file.
matcher <- ".*?\\.txt$"
values <- c(
  "chess_players_ratings.txt",
  "chess_players_ratings.pdf",
  "Chess_PLAYERS_raTings.txt",
  "Chess_PlAYeRS.tx"
)

data.frame(
  value = values,
  matched = str_detect(string = values, pattern = matcher)
)

##                       value matched
## 1 chess_players_ratings.txt    TRUE
## 2 chess_players_ratings.pdf   FALSE
## 3 Chess_PLAYERS_raTings.txt    TRUE
## 4          Chess_PlAYeRS.tx   FALSE

\d{2}/\d{2}/\d{4}

# Matches two digits, a slash, two digits, a slash and four digits -- the
# shape of a date such as 11/22/4840. Only the layout is checked, not whether
# the numbers form a real date, so "20/50/5504" matches as well.
matcher <- "\\d{2}/\\d{2}/\\d{4}"
values <- c("3/50", "20/50/5504", "99/100-33/50", "11/22/4840")

data.frame(
  value = values,
  matched = str_detect(string = values, pattern = matcher)
)

##          value matched
## 1         3/50   FALSE
## 2   20/50/5504    TRUE
## 3 99/100-33/50   FALSE
## 4   11/22/4840    TRUE

<(.+?)>.+?</\1>

# Matches an opening tag "<name>", at least one character of content, and a
# closing tag "</name>"; the back-reference \1 forces the closing tag to
# repeat exactly what the opening tag captured (HTML/XML-style elements).
matcher <- "<(.+?)>.+?</\\1>"
values <- c(
  "<a>click here to sign up</a>",
  "sps.cuny.edu",
  "<button> a classic button</button>",
  "https://www.amazon.com"
)

data.frame(
  value = values,
  matched = str_detect(string = values, pattern = matcher)
)

##                                value matched
## 1       <a>click here to sign up</a>    TRUE
## 2                       sps.cuny.edu   FALSE
## 3 <button> a classic button</button>    TRUE
## 4             https://www.amazon.com   FALSE

9. The following code hides a secret message. Crack it with R and regular expressions.

Hint: Some of the characters are more revealing than others! The code snippet is also available in the materials at www.r-datacollection.com.

# On close inspection the hidden message is spelled out by the upper-case
# letters (plus "." and "!") scattered among lower-case letters and digits.

nerdMs <- 'clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0Tanwo
Uwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigO
d6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5
fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr'

superNerd <- local({
  # Strip the noise: lower-case letters, digits and the embedded line breaks.
  signal <- str_replace_all(nerdMs, "([a-z])|([0-9])|\n", "")
  # The remaining periods stand in for the spaces between words.
  str_replace_all(signal, "\\.", " ")
})

superNerd

## [1] "CONGRATULATIONS YOU ARE A SUPERNERD!"