607 Week3 Assignment

Problems 3, 4, and 9 from Chapter 8 of Automated Data Collection with R.

Problem 3: Here is the code for the introductory example referenced in Problem 3:

  1. Copy the introductory example. The vector name stores the extracted names.
#Load package
library(stringr)
# Raw text of the introductory example: names and phone numbers run together
# in one string.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"

raw.data
## [1] "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"

# Names: runs of at least two letters, periods, commas or spaces.
# raw.data is a single string, so the first list element holds every match.
name <- str_extract_all(raw.data, "[[:alpha:]., ]{2,}")[[1]]

# Phone numbers: an optional area code (optionally in parentheses), then a
# three-digit and a four-digit group, each optionally preceded by a dash or
# a space.
phone <- str_extract_all(
  raw.data,
  "\\(?(\\d{3})?\\)?(-| )?\\d{3}(-| )?\\d{4}"
)[[1]]

name
## [1] "Moe Szyslak"          "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders"         "Simpson, Homer"       "Dr. Julius Hibbert"
phone
## [1] "555-1239"       "(636) 555-0113" "555-6542"       "555 8904"      
## [5] "636-555-3226"   "5553642"
# The extracted phone numbers come in several formats: with or without an
# area code, and with a dash, a space, or nothing between the digit groups.

3.1. Use the tools of this chapter to rearrange the vector so that all elements conform to the standard first_name last_name.

# Step 1: remove a single-letter initial such as " C. ".
# [[:alpha:]] is used instead of [A-z]: the A-z range also takes in the
# punctuation characters that sit between "Z" and "a" in ASCII.
name2 <- sub(" [[:alpha:]]\\. ", " ", name)
name2
## [1] "Moe Szyslak"          "Burns, Montgomery"    "Rev. Timothy Lovejoy"
## [4] "Ned Flanders"         "Simpson, Homer"       "Dr. Julius Hibbert"
# Step 2: switch "last, first" to "first last". Two of the names contain a
# comma and need to be switched.
name3 <- sub("(\\w+),\\s(\\w+)", "\\2 \\1", name2)
name3
## [1] "Moe Szyslak"          "Montgomery Burns"     "Rev. Timothy Lovejoy"
## [4] "Ned Flanders"         "Homer Simpson"        "Dr. Julius Hibbert"
# Step 3: remove a leading two- or three-letter title ("Dr. ", "Rev. ").
# The ^ anchor limits the match to the start of the name, so an abbreviation
# elsewhere in the name is left alone.
name4 <- sub("^[[:alpha:]]{2,3}\\. ", "", name3)
name4
## [1] "Moe Szyslak"      "Montgomery Burns" "Timothy Lovejoy" 
## [4] "Ned Flanders"     "Homer Simpson"    "Julius Hibbert"

3.2. Construct a logical vector indicating whether a character has a title.

# A title is taken to be a two- or three-letter abbreviation followed by a
# period and a space at the start of the name (e.g. "Dr. ", "Rev. ").
# [[:alpha:]] replaces [A-z], whose range also covers the punctuation between
# "Z" and "a" in ASCII; ^ restricts the match to the start of the name.
title_yn <- str_detect(name3, "^[[:alpha:]]{2,3}\\. ")
df <- data.frame(name3, title_yn)
df
##                  name3 title_yn
## 1          Moe Szyslak    FALSE
## 2     Montgomery Burns    FALSE
## 3 Rev. Timothy Lovejoy     TRUE
## 4         Ned Flanders    FALSE
## 5        Homer Simpson    FALSE
## 6   Dr. Julius Hibbert     TRUE
# Two of the names have a title: one is two characters long, the other three.

3.3. Construct a logical vector indicating whether a character has a second name

# A second (middle) name is detected here through its abbreviated form only:
# a space, a single letter, a period and a space (e.g. " C. ").
# [[:alpha:]] replaces [A-z], whose range also covers the punctuation between
# "Z" and "a" in ASCII.
second_name <- str_detect(name, " [[:alpha:]]\\. ")
df <- data.frame(name, second_name)
df
##                   name second_name
## 1          Moe Szyslak       FALSE
## 2 Burns, C. Montgomery        TRUE
## 3 Rev. Timothy Lovejoy       FALSE
## 4         Ned Flanders       FALSE
## 5       Simpson, Homer       FALSE
## 6   Dr. Julius Hibbert       FALSE
# One name has a middle name.

Problem 4

  1. Describe the types of strings that conform to the following regular expressions and construct an example that is matched by the regular expression.

4.1. [0-9]+\$
# Matches one or more digits immediately followed by a literal dollar sign,
# anywhere in the string.
pattern <- "[0-9]+\\$"
examples <- c("123$43", "xyz123$9", "XYZ$")
str_detect(string = examples, pattern = pattern)
## [1]  TRUE  TRUE FALSE

4.2. \b[a-z]{1,4}\b

# Matches a whole word, delimited by word boundaries, that consists of one to
# four lowercase letters.
words <- "\\b[a-z]{1,4}\\b"
examples <- c("piG2", "CAT", "dogs")
str_detect(string = examples, pattern = words)
## [1] FALSE FALSE  TRUE

4.3. .*?\.txt$

# Matches any string that ends in ".txt": a lazy (possibly empty) run of
# arbitrary characters followed by a literal ".txt" at the end of the string.
pattern <- ".*?\\.txt$"
examples <- c(".txt", "test.txt", "abc/123.txt", "a$b#1.txt")
str_detect(string = examples, pattern = pattern)
## [1] TRUE TRUE TRUE TRUE

4.4. \d{2}/\d{2}/\d{4}

# Matches two digits, a forward slash, two more digits, another forward slash
# and four more digits - the shape of a date such as 02/14/2017.
pattern <- "\\d{2}/\\d{2}/\\d{4}"
examples <- c("02/14/2017", "DD11/12MM/2020RR")
str_detect(string = examples, pattern = pattern)
## [1]  TRUE FALSE

4.5. <(.+?)>.+?</\1>

# Matches an XML/HTML-style element: an opening tag <name>, at least one
# character of content, and a closing tag </name> whose name repeats the
# opening one through the \1 backreference.
pattern <- "<(.+?)>.+?</\\1>"
examples <- c("<tag>Text</tag>", "<tag>123</tag>")
str_detect(string = examples, pattern = pattern)
## [1] TRUE TRUE

Problem 9

  1. The following code hides a secret message. Crack it with R and regular expressions. Hint: Some of the characters are more revealing than others! The code snippet is also available in the materials at www.r-datacollection.com.
# Scrambled text from the exercise; the secret message is hidden among the
# filler characters.
hidden <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"

# Look at each character class in turn: uppercase letters, lowercase letters
# and digits.
unlist(str_extract_all(hidden, "[A-Z]"))
##  [1] "C" "O" "N" "G" "R" "A" "T" "U" "L" "A" "T" "I" "O" "N" "S" "Y" "O"
## [18] "U" "A" "R" "E" "A" "S" "U" "P" "E" "R" "N" "E" "R" "D"
unlist(str_extract_all(hidden, "[a-z]"))
##   [1] "c" "l" "c" "o" "p" "o" "w" "z" "m" "s" "t" "c" "d" "w" "n" "k" "i"
##  [18] "g" "v" "d" "i" "c" "p" "u" "g" "g" "v" "h" "r" "y" "n" "j" "u" "w"
##  [35] "c" "z" "i" "h" "q" "r" "f" "p" "x" "s" "j" "d" "w" "p" "n" "a" "n"
##  [52] "w" "o" "w" "i" "s" "d" "i" "j" "j" "k" "p" "f" "d" "r" "c" "o" "c"
##  [69] "b" "t" "y" "c" "z" "j" "a" "t" "a" "o" "o" "t" "j" "t" "j" "n" "e"
##  [86] "c" "f" "e" "k" "r" "w" "w" "w" "o" "j" "i" "g" "d" "v" "r" "f" "r"
## [103] "b" "z" "b" "k" "n" "b" "h" "z" "g" "v" "i" "z" "c" "r" "o" "p" "w"
## [120] "g" "n" "b" "q" "o" "f" "a" "o" "t" "f" "b" "w" "m" "k" "t" "s" "z"
## [137] "q" "e" "f" "y" "n" "d" "t" "k" "c" "f" "g" "m" "c" "g" "x" "o" "n"
## [154] "h" "k" "g" "r"
unlist(str_extract_all(hidden, "[0-9]"))
##  [1] "1" "0" "8" "7" "7" "9" "2" "8" "5" "5" "0" "7" "8" "0" "3" "5" "3"
## [18] "0" "7" "5" "5" "3" "3" "6" "4" "1" "1" "6" "2" "2" "4" "9" "0" "5"
## [35] "6" "5" "1" "7" "2" "4" "6" "3" "9" "5" "8" "9" "6" "5" "9" "4" "9"
## [52] "0" "5" "4" "5"
# The hidden message is revealed by extracting just the uppercase characters.
# Keeping the "." and "!" characters as well preserves the word separators,
# and collapsing the pieces gives the message as a single string.
paste(unlist(str_extract_all(hidden, "[A-Z.!]")), collapse = "")
## [1] "CONGRATULATIONS.YOU.ARE.A.SUPERNERD!"