Problems 3 and 4 (plus the hidden-message exercise) from Chapter 8 of Automated Data Collection with R.
Problem 3: Here is the code for the introductory example referenced in Problem 3:
#Load package
library(stringr)
# Source text: six names and their phone numbers, all run together.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"
raw.data
## [1] "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"
# Names: every run of two or more letters, periods, commas or spaces.
# raw.data is a single string, so the match list has exactly one element.
name <- str_extract_all(raw.data, "[[:alpha:]., ]{2,}")[[1]]
name
## [1] "Moe Szyslak" "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" "Simpson, Homer" "Dr. Julius Hibbert"
# Phone numbers: an optional three-digit area code (optionally wrapped in
# parentheses), then three digits and four digits, each group optionally
# preceded by "-" or a space.
phone <- str_extract_all(
  raw.data,
  "\\(?(\\d{3})?\\)?(-| )?\\d{3}(-| )?\\d{4}"
)[[1]]
phone
## [1] "555-1239" "(636) 555-0113" "555-6542" "555 8904"
## [5] "636-555-3226" "5553642"
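# Phone numbers come in several formats: with or without an area code (optionally in parentheses), and with "-", a space, or nothing as the separator.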
3.1. Use the tools of this chapter to rearrange the vector so that all elements conform to the standard first_name last_name.
# Remove a middle initial: a single letter followed by a period, with a
# space on either side.
# [A-Za-z] is used instead of [A-z] because, in ASCII order, the A-z range
# also covers the characters between "Z" and "a" ([, \, ], ^, _ and `).
name2 <- sub(" [A-Za-z]\\. ", " ", name)
name2
## [1] "Moe Szyslak" "Burns, Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" "Simpson, Homer" "Dr. Julius Hibbert"
# Two of the names are written as "last, first"; swap the word before the
# comma with the word after it so that they read "first last".
name3 <- sub(
  pattern = "(\\w+),\\s(\\w+)",
  replacement = "\\2 \\1",
  x = name2
)
name3
## [1] "Moe Szyslak" "Montgomery Burns" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" "Homer Simpson" "Dr. Julius Hibbert"
# Remove titles: a leading abbreviation of two or three letters followed by
# a period and a space (here "Rev." and "Dr.").
# - [A-Za-z] replaces [A-z], whose range also covers [, \, ], ^, _ and `.
# - The ^ anchor stops the pattern from trimming the tail of a longer
#   abbreviation (unanchored, "Prof. Frink" would become "PFrink").
name4 <- sub("^[A-Za-z]{2,3}\\. ", "", name3)
name4
## [1] "Moe Szyslak" "Montgomery Burns" "Timothy Lovejoy"
## [4] "Ned Flanders" "Homer Simpson" "Julius Hibbert"
3.2. Construct a logical vector indicating whether a character has a title.
# A name has a title when it starts with a two- or three-letter abbreviation
# followed by a period and a space.
# [A-Za-z] replaces [A-z] (which also matches [, \, ], ^, _ and `), and the
# ^ anchor restricts the match to the start of the name.
title_yn <- str_detect(name3, "^[A-Za-z]{2,3}\\. ")
df <- data.frame(name3, title_yn)
df
## name3 title_yn
## 1 Moe Szyslak FALSE
## 2 Montgomery Burns FALSE
## 3 Rev. Timothy Lovejoy TRUE
## 4 Ned Flanders FALSE
## 5 Homer Simpson FALSE
## 6 Dr. Julius Hibbert TRUE
# Two of the names have a title: one is two letters long ("Dr.") and the
# other is three letters long ("Rev.").
3.3. Construct a logical vector indicating whether a character has a second name
# In this data a second (middle) name appears as a single-letter initial
# followed by a period, with a space on either side.
# [A-Za-z] replaces [A-z] (which also matches [, \, ], ^, _ and `); the {1}
# quantifier was redundant and has been dropped.
second_name <- str_detect(name, " [A-Za-z]\\. ")
df <- data.frame(name, second_name)
df
## name second_name
## 1 Moe Szyslak FALSE
## 2 Burns, C. Montgomery TRUE
## 3 Rev. Timothy Lovejoy FALSE
## 4 Ned Flanders FALSE
## 5 Simpson, Homer FALSE
## 6 Dr. Julius Hibbert FALSE
# Only one name ("Burns, C. Montgomery") has a second name.
# 4.1. [0-9]+\$
# Matches one or more digits immediately followed by a literal dollar sign.
pattern <- "[0-9]+\\$"
examples <- c("123$43", "xyz123$9", "XYZ$")
str_detect(examples, pattern)
## [1] TRUE TRUE FALSE
4.2. \b[a-z]{1,4}\b
# Matches a whole word (delimited by word boundaries on both sides) made up
# of one to four lowercase letters.
words <- "\\b[a-z]{1,4}\\b"
examples <- c("piG2", "CAT", "dogs")
str_detect(examples, words)
## [1] FALSE FALSE TRUE
4.3. .*?\.txt$
# Matches any string that ends in ".txt"; the lazy .*? allows any prefix,
# including an empty one, before the extension.
pattern <- ".*?\\.txt$"
examples <- c(".txt", "test.txt", "abc/123.txt", "a$b#1.txt")
str_detect(examples, pattern)
## [1] TRUE TRUE TRUE TRUE
4.4. \d{2}/\d{2}/\d{4}
# Matches two digits, a forward slash, two more digits, another forward
# slash and four digits, i.e. a date written like 02/14/2017.
pattern <- "\\d{2}/\\d{2}/\\d{4}"
examples <- c("02/14/2017", "DD11/12MM/2020RR")
str_detect(examples, pattern)
## [1] TRUE FALSE
4.5. <(.+?)>.+?</\1>
# Matches an opening tag, at least one character of content, and a closing
# tag whose name repeats the opening one (enforced by the \1 backreference),
# as in an XML/HTML element. The third example has mismatched tags and so
# does not match.
pattern <- "<(.+?)>.+?</\\1>"
examples <- c("<tag>Text</tag>", "<tag>123</tag>", "<tag>Text</other>")
str_detect(examples, pattern)
## [1] TRUE TRUE FALSE
# Hidden-message exercise: a message is scattered through a string of
# letters, digits and punctuation.
hidden <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"
# The uppercase letters spell out the words, but without any spacing.
unlist(str_extract_all(hidden, "[A-Z]"))
## [1] "C" "O" "N" "G" "R" "A" "T" "U" "L" "A" "T" "I" "O" "N" "S" "Y" "O"
## [18] "U" "A" "R" "E" "A" "S" "U" "P" "E" "R" "N" "E" "R" "D"
# The lowercase letters show no readable message.
unlist(str_extract_all(hidden, "[a-z]"))
## [1] "c" "l" "c" "o" "p" "o" "w" "z" "m" "s" "t" "c" "d" "w" "n" "k" "i"
## [18] "g" "v" "d" "i" "c" "p" "u" "g" "g" "v" "h" "r" "y" "n" "j" "u" "w"
## [35] "c" "z" "i" "h" "q" "r" "f" "p" "x" "s" "j" "d" "w" "p" "n" "a" "n"
## [52] "w" "o" "w" "i" "s" "d" "i" "j" "j" "k" "p" "f" "d" "r" "c" "o" "c"
## [69] "b" "t" "y" "c" "z" "j" "a" "t" "a" "o" "o" "t" "j" "t" "j" "n" "e"
## [86] "c" "f" "e" "k" "r" "w" "w" "w" "o" "j" "i" "g" "d" "v" "r" "f" "r"
## [103] "b" "z" "b" "k" "n" "b" "h" "z" "g" "v" "i" "z" "c" "r" "o" "p" "w"
## [120] "g" "n" "b" "q" "o" "f" "a" "o" "t" "f" "b" "w" "m" "k" "t" "s" "z"
## [137] "q" "e" "f" "y" "n" "d" "t" "k" "c" "f" "g" "m" "c" "g" "x" "o" "n"
## [154] "h" "k" "g" "r"
# Neither do the digits.
unlist(str_extract_all(hidden, "[0-9]"))
## [1] "1" "0" "8" "7" "7" "9" "2" "8" "5" "5" "0" "7" "8" "0" "3" "5" "3"
## [18] "0" "7" "5" "5" "3" "3" "6" "4" "1" "1" "6" "2" "2" "4" "9" "0" "5"
## [35] "6" "5" "1" "7" "2" "4" "6" "3" "9" "5" "8" "9" "6" "5" "9" "4" "9"
## [52] "0" "5" "4" "5"
# The periods fall exactly between the words and "!" closes the message, so
# keep them together with the uppercase letters, glue the characters into
# one string and turn every period into a space.
message_chars <- unlist(str_extract_all(hidden, "[A-Z.!]"))
gsub(".", " ", paste(message_chars, collapse = ""), fixed = TRUE)
## [1] "CONGRATULATIONS YOU ARE A SUPERNERD!"
# The hidden message is revealed by extracting the uppercase characters
# together with the punctuation that separates the words.