1. Copy the introductory example. The vector name stores the extracted names. R> name [1] “Moe Szyslak” “Burns, C. Montgomery” “Rev. Timothy Lovejoy” [4] “Ned Flanders” “Simpson, Homer” “Dr. Julius Hibbert”
library(stringr)

# Raw phone-book text: phone numbers and names are run together with no
# separator between the entries.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"

# A name is any run of two or more letters, periods, commas, or spaces.
# raw.data is a single string, so the first (and only) list element returned
# by str_extract_all() already holds every match.
name <- str_extract_all(raw.data, "[[:alpha:]., ]{2,}")[[1]]
print(name)
## [1] "Moe Szyslak"          "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders"         "Simpson, Homer"       "Dr. Julius Hibbert"
  1. Use the tools of this chapter to rearrange the vector so that all elements conform to the standard first_name last_name.
    Base R's sub() strips titles and middle initials and swaps "last, first" entries; stringr's str_trim() then removes the leftover whitespace.
# Normalise every entry to "first_name last_name":
#   1. drop the first short abbreviation followed by ". " (the titles
#      "Rev. " / "Dr. " and the middle initial "C. "),
#   2. swap "last, first" around the comma,
#   3. trim the whitespace left behind by the swap.
# local() keeps the intermediate vectors out of the global environment.
name.fl <- local({
  no_abbrev <- sub("[[:alpha:].]{1,3}\\. ", "", name)
  reordered <- sub("([^,]+),([^,]+)", "\\2 \\1", no_abbrev)
  str_trim(reordered)
})

# Show the original and the rearranged names side by side.
cbind(name, name.fl)
##      name                   name.fl           
## [1,] "Moe Szyslak"          "Moe Szyslak"     
## [2,] "Burns, C. Montgomery" "Montgomery Burns"
## [3,] "Rev. Timothy Lovejoy" "Timothy Lovejoy" 
## [4,] "Ned Flanders"         "Ned Flanders"    
## [5,] "Simpson, Homer"       "Homer Simpson"   
## [6,] "Dr. Julius Hibbert"   "Julius Hibbert"
  1. Construct a logical vector indicating whether a character has a title (i.e., Rev. and Dr.).
# Flag entries that carry a title: two or three letters/periods followed by a
# period and a space (matches "Rev. " and "Dr. "). A one-letter initial such
# as "C. " is too short to satisfy the {2,3} quantifier.
title <- str_detect(string = name, pattern = "[[:alpha:].]{2,3}\\. ")

# Show each name next to its title flag.
cbind(name = name, title = title)
##      name                   title  
## [1,] "Moe Szyslak"          "FALSE"
## [2,] "Burns, C. Montgomery" "FALSE"
## [3,] "Rev. Timothy Lovejoy" "TRUE" 
## [4,] "Ned Flanders"         "FALSE"
## [5,] "Simpson, Homer"       "FALSE"
## [6,] "Dr. Julius Hibbert"   "TRUE"
  1. Construct a logical vector indicating whether a character has a second name.
    Two answers are given below: the reformatted names exclude the middle initial, while the original, unformatted vector retains it.
# Check 1 - rearranged "first_name last_name" vector: look for a single
# letter (or period) followed by ". ", i.e. an abbreviated name.
fullname <- str_detect(string = name.fl, pattern = "[[:alpha:].]{1}\\. ")

# Check 2 - original vector: same idea, but the abbreviated name must be
# preceded by a space.
fullname.check <- str_detect(string = name, pattern = " [[:alpha:].]{1}\\. ")

# Show the original names next to the second-name flag.
cbind(name = name, fullname.check = fullname.check)
##      name                   fullname.check
## [1,] "Moe Szyslak"          "FALSE"       
## [2,] "Burns, C. Montgomery" "TRUE"        
## [3,] "Rev. Timothy Lovejoy" "FALSE"       
## [4,] "Ned Flanders"         "FALSE"       
## [5,] "Simpson, Homer"       "FALSE"       
## [6,] "Dr. Julius Hibbert"   "FALSE"
# Show the rearranged names next to their second-name flag.
cbind(name.fl = name.fl, fullname = fullname)
##      name.fl            fullname
## [1,] "Moe Szyslak"      "FALSE" 
## [2,] "Montgomery Burns" "FALSE" 
## [3,] "Timothy Lovejoy"  "FALSE" 
## [4,] "Ned Flanders"     "FALSE" 
## [5,] "Homer Simpson"    "FALSE" 
## [6,] "Julius Hibbert"   "FALSE"
  1. Describe the types of strings that conform to the following regular expressions and construct an example that is matched by the regular expression.
  1. [0-9]+\\$
  2. \\b[a-z]{1,4}\\b
  3. .*?\\.txt$
  4. \\d{2}/\\d{2}/\\d{4}
  5. <(.+?)>.+?</\\1>

The strings “1000$”, “ 10$”, “ 45$”, “5.00”, and “abcdef” are tested for (a).

# Test strings for pattern (a): one or more digits followed by a literal "$".
answer_a <- c("1000$", " 10$", " 45$", "5.00", "abcdef")
str_detect(answer_a, "([0-9]+\\$)")
## [1]  TRUE  TRUE  TRUE FALSE FALSE
# str_extract_all() returns a list with one element per input string. Flatten
# it before joining; handing the list straight to str_c() deparses each
# element, so inputs without a match end up as the literal text
# "character(0)" in the result.
answer_a_str <- str_c(
  unlist(str_extract_all(answer_a, "([0-9]+\\$)")),
  collapse = ", "
)

The strings “now is the time for all good peeps”, “ a ”, and “four four twos ones” are tested for (b).

# Test strings for pattern (b): whole lowercase words of one to four letters.
answer_b <- c("now is the time for all good peeps ", " a ", "four four twos ones")
str_detect(answer_b, "\\b[a-z]{1,4}\\b")
## [1] TRUE TRUE TRUE
# str_extract_all() returns a list with one element per input string. Flatten
# it before joining; handing the list straight to str_c() deparses elements
# with several matches into text such as c("now", "is", ...).
answer_b_str <- str_c(
  unlist(str_extract_all(answer_b, "\\b[a-z]{1,4}\\b")),
  collapse = ", "
)

The strings “file.name.txt”,“code_2_3_2019.txt”,“filename”,“google.com” are tested for (c).

# Test strings for pattern (c): anything that ends in ".txt".
answer_c <- c("file.name.txt", "code_2_3_2019.txt", "filename", "google.com")
str_detect(answer_c, ".*?\\.txt$")
## [1]  TRUE  TRUE FALSE FALSE
# str_extract_all() returns a list with one element per input string. Flatten
# it before joining; handing the list straight to str_c() deparses each
# element, so inputs without a match end up as the literal text
# "character(0)" in the result.
answer_c_str <- str_c(
  unlist(str_extract_all(answer_c, ".*?\\.txt$")),
  collapse = ", "
)

The string “11/12/2019” is tested for (d).

# Test string for pattern (d): a date written as two digits / two digits /
# four digits.
answer_d <- "11/12/2019"
str_detect(answer_d, "\\d{2}/\\d{2}/\\d{4}")
## [1] TRUE
# Flatten the list of matches into a plain character vector, then join the
# matches into one comma-separated string.
answer_d_str <- str_c(
  unlist(str_extract_all(answer_d, "\\d{2}/\\d{2}/\\d{4}")),
  collapse = ", "
)

Strings containing HTML tags are tested for (e).

# Test strings for pattern (e): text enclosed by a matching pair of
# opening and closing tags (the back-reference \\1 repeats the tag name).
answer_e <- c(
  "<div>paragraph or graphic</div>",
  "<p>paragraph</p>",
  "<div>The cat jumped over the moon. The dog quickly followed. The mouse just chilled.</div>"
)
str_detect(answer_e, "<(.+?)>.+?</\\1>")
## [1] TRUE TRUE TRUE
# Flatten the list of matches into a plain character vector, then join the
# matches into one comma-separated string.
answer_e_str <- str_c(
  unlist(str_extract_all(answer_e, "<(.+?)>.+?</\\1>")),
  collapse = ", "
)

Answers are combined in a dataframe and displayed using kableExtra. Backslashes are altered in knitr, so a footnote is added to explain.

# Assemble the answer table: one row per exercise pattern, holding the pattern
# text to display, the matches collected above, and a short description.
# stringsAsFactors = FALSE makes data.frame() create character columns rather
# than factors on R < 4.0 as well.
ex <- data.frame(
  example = c(answer_a_str, answer_b_str, answer_c_str, answer_d_str, answer_e_str),
  stringsAsFactors = FALSE
)
# Pattern text for display only; it is not used for matching.
ptrn <- data.frame(
  pattern = c(
    "[0-9]+\\\\$",
    "\\\\b[a-z]{1,4}\\\\b",
    ".*?\\\\.txt$",
    "\\\\d{2}/\\\\d{2}/\\\\d{4}",
    "<(.+?)>.+?</\\\\1>"
  ),
  stringsAsFactors = FALSE
)
desc <- data.frame(
  description = c(
    "numbers with dollar signs",
    "words of length 1 to 4",
    "file names ending in .txt",
    "dates in format ##/##/####",
    "text wrapped in html tags"
  ),
  stringsAsFactors = FALSE
)

answ_ex <- cbind(ptrn, ex, desc)

# Scrub the example column: remove any deparse artefacts ("character(0)",
# "c(", ")", double quotes) and then the comma separators, leaving the
# matches separated by spaces.
answ_ex$example <- gsub("character[(]0[)]|c[(]|[)]|[\"]", "", answ_ex$example)
answ_ex$example <- gsub("\\,", "", answ_ex$example)

library(kableExtra)
# Render as a striped HTML table; escape = TRUE keeps the HTML tags in the
# examples visible as text instead of being interpreted by the browser.
anstable <- knitr::kable(answ_ex, "html", escape = TRUE) %>%
  kable_styling("striped", full_width = FALSE)
anstable <- column_spec(anstable, 2:3, width_max = "40em")
add_footnote(
  anstable,
  label = "Backslashes in the patterns above are actually doubled"
)
pattern example description
[0-9]+\$ 1000$ 10$ 45$ numbers with dollar signs
\b[a-z]{1,4}\b now is the time for all good a four four twos ones words of length 1 to 4
.*?\.txt$ file.name.txt code_2_3_2019.txt file names ending in .txt
\d{2}/\d{2}/\d{4} 11/12/2019 dates in format ##/##/####
<(.+?)>.+?</\1> <div>paragraph or graphic</div> <p>paragraph</p> <div>The cat jumped over the moon. The dog quickly followed. The mouse just chilled.</div> text wrapped in html tags
a Backslashes in the patterns above are actually doubled
  1. The following code hides a secret message. Crack it with R and regular expressions. Hint: Some of the characters are more revealing than others! The code snippet is also available in the materials at www.r-datacollection.com.
# Encoded text: the hidden message is whatever remains once the filler
# characters are removed.
string <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0Tanwo
Uwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigO
d6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5
fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"

# Remove every lowercase letter, digit, and line break; the uppercase letters
# and punctuation that survive spell out the message.
str_remove_all(string, "([a-z])|([0-9])|\n")
## [1] "CONGRATULATIONS.YOU.ARE.A.SUPERNERD!"