Problems 3 and 4 from chapter 8 of Automated Data Collection in R. Problem 9 is extra credit.

3. Copy the introductory example. The vector name stores the extracted names.

R> name

[1] “Moe Szyslak” “Burns, C. Montgomery” “Rev. Timothy Lovejoy”

[4] “Ned Flanders” “Simpson, Homer” “Dr. Julius Hibbert”

library(stringr)

# Unstructured phone-book entry: phone numbers and names are run together
# without any separator between records.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"

# A name is a run of two or more letters, periods, commas, or spaces.
name_pattern <- "[[:alpha:]., ]{2,}"

# A phone number is an optional (possibly parenthesised) three-digit prefix,
# then three digits and four digits, each optionally preceded by "-" or " ".
phone_pattern <- "\\(?(\\d{3})?\\)?(-| )?\\d{3}(-| )?\\d{4}"

# str_extract_all() returns a list with one element per input string;
# raw.data is a single string, so the first element holds every match.
name <- str_extract_all(raw.data, name_pattern)[[1]]
phone <- str_extract_all(raw.data, phone_pattern)[[1]]

# Pair each name with its phone number and print the table.
df <- data.frame(name, phone)
df
##                   name          phone
## 1          Moe Szyslak       555-1239
## 2 Burns, C. Montgomery (636) 555-0113
## 3 Rev. Timothy Lovejoy       555-6542
## 4         Ned Flanders       555 8904
## 5       Simpson, Homer   636-555-3226
## 6   Dr. Julius Hibbert        5553642

(a) Use the tools of this chapter to rearrange the vector so that all elements conform to the standard first_name last_name.

# Entries stored as "last_name, first_name" are the ones containing a comma.
grep(pattern = ",", x = name, value = TRUE)
## [1] "Burns, C. Montgomery" "Simpson, Homer"

# Swap the two parts around the comma with back-references instead of
# hard-coding each person; names without a comma do not match and are
# returned unchanged.
name <- str_replace(name, pattern = "^(.+), (.+)$", replacement = "\\2 \\1")

# Strip a leading title: an abbreviation of two or more letters followed by a
# literal period. The period is escaped (a bare "." matches any character) and
# the following whitespace is consumed so no leading blank is left behind.
# A one-letter initial such as "C." is too short to match and is kept.
name <- str_replace(name, pattern = "^[[:alpha:]]{2,}\\.\\s+", replacement = "")

library(tidyr)
df <- data.frame(name = name, phone = phone)

# Split each name at its LAST space, so a middle initial stays with the first
# name ("C. Montgomery" / "Burns"). extract() already returns a data frame.
nameDf <- extract(df, name, c("FirstName", "LastName"), "^(.+) ([^ ]+)$")
nameDf
##       FirstName LastName          phone
## 1           Moe  Szyslak       555-1239
## 2 C. Montgomery    Burns (636) 555-0113
## 3       Timothy  Lovejoy       555-6542
## 4           Ned Flanders       555 8904
## 5         Homer  Simpson   636-555-3226
## 6        Julius  Hibbert        5553642

(b) Construct a logical vector indicating whether a character has a title (i.e., Rev. and Dr.).

library(stringr)

# Start again from the raw string so that the names still carry their titles.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"

name <- unlist(str_extract_all(raw.data, "[[:alpha:]., ]{2,}"))
phone <- unlist(str_extract_all(raw.data, "\\(?(\\d{3})?\\)?(-| )?\\d{3}(-| )?\\d{4}"))

# Reorder "last_name, first_name" entries with back-references rather than
# hard-coding each person; names without a comma are left untouched.
name <- str_replace(name, pattern = "^(.+), (.+)$", replacement = "\\2 \\1")

# A title is a leading abbreviation of two or more letters followed by a
# literal period (e.g. "Rev.", "Dr."). Anchoring at the start and requiring
# at least two letters keeps a one-letter initial such as "C." from counting
# as a title.
title <- str_detect(name, "^[[:alpha:]]{2,}\\.")
data.frame(name, title)
##                   name title
## 1          Moe Szyslak FALSE
## 2  C. Montgomery Burns FALSE
## 3 Rev. Timothy Lovejoy  TRUE
## 4         Ned Flanders FALSE
## 5        Homer Simpson FALSE
## 6   Dr. Julius Hibbert  TRUE

(c) Construct a logical vector indicating whether a character has a second name.

# A second name shows up as a single capital initial followed by a literal
# period (e.g. the "C." in "C. Montgomery Burns"). The word boundary requires
# the capital to stand alone, so the final letter of an all-caps abbreviation
# is not mistaken for an initial.
secondName <- str_detect(name, "\\b[A-Z]\\.")
data.frame(name, secondName)
##                   name secondName
## 1          Moe Szyslak      FALSE
## 2  C. Montgomery Burns       TRUE
## 3 Rev. Timothy Lovejoy      FALSE
## 4         Ned Flanders      FALSE
## 5        Homer Simpson      FALSE
## 6   Dr. Julius Hibbert      FALSE

4. Describe the types of strings that conform to the following regular expressions and construct an example that is matched by the regular expression.

(a) [0-9]+\$

Answer: Matches one or more digits (0-9) immediately followed by a literal dollar sign ($).

# Test strings: only digits that sit directly in front of "$" should match.
str1 <- c(
  "apple$", "6890$", "chicken88$", "8$",
  "999yu$", "699", "$"
)
# One or more digits followed by an escaped (literal) dollar sign.
str_extract(string = str1, pattern = "[0-9]+\\$")
## [1] NA      "6890$" "88$"   "8$"    NA      NA      NA

(b) \b[a-z]{1,4}\b

Answer: Matches a whole word made up of one to four lowercase letters, delimited by a word boundary on each side.

# Test strings mixing short words, long words, digits and capitals.
str2 <- c(
  "apples x4",
  "bag of flour",
  "bagofsugar",
  "mlk x2",
  "Milk"
)
# First stand-alone word of one to four lowercase letters in each string.
str_extract(string = str2, pattern = "\\b[a-z]{1,4}\\b")
## [1] NA    "bag" NA    "mlk" NA

(c) .*?\.txt$

Answer: Matches any string that ends in “.txt”, preceded by any (possibly empty) sequence of characters.

# Test strings: only those whose very last characters are ".txt" should match.
str3 <- c(
  "lab0.txt $", "hw3.rmd", "movies.csv",
  "999.txt$", "shop1.txt", ".txt"
)
# Any characters (lazily), then a literal ".txt" anchored at the end.
str_extract(string = str3, pattern = ".*?\\.txt$")
## [1] NA          NA          NA          NA          "shop1.txt" ".txt"

(d) \d{2}/\d{2}/\d{4}

Answer: Matches two digits, a slash, two digits, another slash, and then four digits, e.g. a date written as mm/dd/yyyy or dd/mm/yyyy.

# Test strings: only a full two-digit/two-digit/four-digit group should match.
str4 <- c(
  "h9/30/1890",
  "09/17/2017",
  "dd/mm/yyyy",
  "05:39:54",
  "54/878/9287"
)
# Two digits, "/", two digits, "/", four digits.
str_extract(string = str4, pattern = "\\d{2}/\\d{2}/\\d{4}")
## [1] NA           "09/17/2017" NA           NA           NA

(e) <(.+?)>.+?</\1>

Answer: Matches the first expression that opens with a tag < > and closes with the corresponding tag </ >; the characters inside < > and </ > must be identical, as in HTML.

### .+? : there is at least one character between the opening tag < > and the closing tag </ >

# A sentence containing two tagged spans; only the first one is extracted.
str5 <- "Collected R wisdoms<pg>'What we have is nice'</pg><r>.'..do'.</r>"
# Opening tag (name captured lazily), at least one character of content,
# then a closing tag whose name repeats the captured group.
str_extract(string = str5, pattern = "<(.+?)>.+?</\\1>")
## [1] "<pg>'What we have is nice'</pg>"

9. The following code hides a secret message. Crack it with R and regular expressions.

Hint: Some of the characters are more revealing than others! The code snippet is also available in the materials at www.r-datacollection.com.

Answer:

# Encoded text; the line breaks are part of the string literal.
str <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigO
d6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5
fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"

# The revealing characters are the upper-case letters plus the punctuation
# marks "." and "!". The POSIX class is written inside a bracket expression
# ("[[:upper:]...]"); a bare "[:upper:]" is not a portable character class.
newstr <- unlist(str_extract_all(str, "[[:upper:].!]"))

# Glue the characters together; the periods act as word separators, so they
# are replaced by spaces to make the message readable.
secret_message <- str_replace_all(str_c(newstr, collapse = ""), fixed("."), " ")
secret_message
## [1] "CONGRATULATIONS YOU ARE A SUPERNERD!"