Week3_Assignment

Copy the introductory example. The vector name stores the extracted names. R> name [1] “Moe Szyslak” “Burns, C. Montgomery” “Rev. Timothy Lovejoy” [4] “Ned Flanders” “Simpson, Homer” “Dr. Julius Hibbert”

library(stringr)

# Unstructured phone-book text in which phone numbers and names run together.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"
# A name is any run of two or more letters, periods, commas or spaces.
# raw.data is a single string, so the first (and only) list element returned
# by str_extract_all() already holds every match.
name <- str_extract_all(raw.data, "[[:alpha:]., ]{2,}")[[1]]
name

## [1] "Moe Szyslak"          "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders"         "Simpson, Homer"       "Dr. Julius Hibbert"

Use the tools of this chapter to rearrange the vector so that all elements conform to the standard first_name last_name.
Two nested sub() calls (strip the title or initial, then swap "last, first") and stringr's str_trim() were combined into a single line of code.

# Build "first_name last_name" versions of the extracted names.
name.fl <- local({
  # Drop the first short abbreviation ending in ". " -- the titles "Rev. " and
  # "Dr. " as well as the initial "C. ".
  stripped <- sub("[[:alpha:].]{1,3}\\. ", "", name)
  # Turn "last, first" into " first last"; names without a comma are unchanged.
  swapped <- sub("([^,]+),([^,]+)", "\\2 \\1", stripped)
  # Remove the leading blank the swap leaves behind.
  str_trim(swapped)
})
cbind(name, name.fl)

##      name                   name.fl           
## [1,] "Moe Szyslak"          "Moe Szyslak"     
## [2,] "Burns, C. Montgomery" "Montgomery Burns"
## [3,] "Rev. Timothy Lovejoy" "Timothy Lovejoy" 
## [4,] "Ned Flanders"         "Ned Flanders"    
## [5,] "Simpson, Homer"       "Homer Simpson"   
## [6,] "Dr. Julius Hibbert"   "Julius Hibbert"

Construct a logical vector indicating whether a character has a title (i.e., Rev. and Dr.).

# TRUE where a name contains a two- or three-character abbreviation followed
# by ". " (here "Rev. " and "Dr. "); the lone initial "C. " is too short.
title <- str_detect(string = name, pattern = "[[:alpha:].]{2,3}\\. ")
cbind(name = name, title = title)

##      name                   title  
## [1,] "Moe Szyslak"          "FALSE"
## [2,] "Burns, C. Montgomery" "FALSE"
## [3,] "Rev. Timothy Lovejoy" "TRUE" 
## [4,] "Ned Flanders"         "FALSE"
## [5,] "Simpson, Homer"       "FALSE"
## [6,] "Dr. Julius Hibbert"   "TRUE"

Construct a logical vector indicating whether a character has a second name.
Two answers are given below: the reformatted names exclude the middle initial, while the original unformatted list retains it.

# Search the rearranged names (name.fl) for a one-character initial followed
# by ". ".
fullname <- str_detect(string = name.fl, pattern = "[[:alpha:].]{1}\\. ")
# Same search on the original names; the leading blank in the pattern means
# the initial must be preceded by a space.
fullname.check <- str_detect(string = name, pattern = " [[:alpha:].]{1}\\. ")
cbind(name = name, fullname.check = fullname.check)

##      name                   fullname.check
## [1,] "Moe Szyslak"          "FALSE"       
## [2,] "Burns, C. Montgomery" "TRUE"        
## [3,] "Rev. Timothy Lovejoy" "FALSE"       
## [4,] "Ned Flanders"         "FALSE"       
## [5,] "Simpson, Homer"       "FALSE"       
## [6,] "Dr. Julius Hibbert"   "FALSE"

# Show the rearranged names next to the result of the initial search.
cbind(name.fl = name.fl, fullname = fullname)

##      name.fl            fullname
## [1,] "Moe Szyslak"      "FALSE" 
## [2,] "Montgomery Burns" "FALSE" 
## [3,] "Timothy Lovejoy"  "FALSE" 
## [4,] "Ned Flanders"     "FALSE" 
## [5,] "Homer Simpson"    "FALSE" 
## [6,] "Julius Hibbert"   "FALSE"

Describe the types of strings that conform to the following regular expressions and construct an example that is matched by the regular expression.

[0-9]+\\$
\\b[a-z]{1,4}\\b
.*?\\.txt$
\\d{2}/\\d{2}/\\d{4}
<(.+?)>.+?</\\1>

The strings “1000$”, “ 10$”, “ 45$”, “5.00”, “abcdef” are tested for (a).

# Candidate strings for pattern (a): one or more digits followed by a
# literal "$".
answer_a <- c("1000$", " 10$", " 45$", "5.00", "abcdef")
str_detect(answer_a, "([0-9]+\\$)")

## [1]  TRUE  TRUE  TRUE FALSE FALSE

# str_extract_all() returns one list element per input string. Flatten it
# before joining so that only matched text is collapsed; non-matching strings
# then contribute nothing instead of a literal "character(0)".
answer_a_str <- str_c(
  unlist(str_extract_all(answer_a, "([0-9]+\\$)")),
  collapse = ", "
)

The strings “now is the time for all good peeps ”, “ a ”, “four four twos ones” are tested for (b).

# Candidate strings for pattern (b): whole words made of one to four
# lowercase letters.
answer_b <- c("now is the time for all good peeps ", " a ", "four four twos ones")
str_detect(answer_b, "\\b[a-z]{1,4}\\b")

## [1] TRUE TRUE TRUE

# str_extract_all() returns one list element per input string. Flatten it
# before joining so the matched words are collapsed directly rather than
# through the deparsed c("...") text of each list element.
answer_b_str <- str_c(
  unlist(str_extract_all(answer_b, "\\b[a-z]{1,4}\\b")),
  collapse = ", "
)

The strings “file.name.txt”,“code_2_3_2019.txt”,“filename”,“google.com” are tested for (c).

# Candidate strings for pattern (c): anything that ends in ".txt".
answer_c <- c("file.name.txt", "code_2_3_2019.txt", "filename", "google.com")
str_detect(answer_c, ".*?\\.txt$")

## [1]  TRUE  TRUE FALSE FALSE

# str_extract_all() returns one list element per input string. Flatten it
# before joining so that only matched text is collapsed; non-matching strings
# then contribute nothing instead of a literal "character(0)".
answer_c_str <- str_c(
  unlist(str_extract_all(answer_c, ".*?\\.txt$")),
  collapse = ", "
)

The string “11/12/2019” is tested for (d).

# Candidate string for pattern (d): two digits, "/", two digits, "/", four
# digits.
answer_d <- "11/12/2019"
str_detect(answer_d, "\\d{2}/\\d{2}/\\d{4}")

## [1] TRUE

# Flatten the match list returned by str_extract_all() before joining, so the
# character vector of matches is collapsed rather than a list.
answer_d_str <- str_c(
  unlist(str_extract_all(answer_d, "\\d{2}/\\d{2}/\\d{4}")),
  collapse = ", "
)

Strings containing HTML tags are tested for (e).

# Candidate strings for pattern (e): an opening tag, some content, and a
# closing tag whose name repeats the opening one (back-reference \\1).
answer_e <- c(
  "<div>paragraph or graphic</div>",
  "<p>paragraph</p>",
  "<div>The cat jumped over the moon. The dog quickly followed. The mouse just chilled.</div>"
)
str_detect(answer_e, "<(.+?)>.+?</\\1>")

## [1] TRUE TRUE TRUE

# Flatten the match list returned by str_extract_all() before joining, so the
# character vector of matches is collapsed rather than a list.
answer_e_str <- str_c(
  unlist(str_extract_all(answer_e, "<(.+?)>.+?</\\1>")),
  collapse = ", "
)

Answers are combined in a dataframe and displayed using kableExtra. Backslashes are altered in knitr, so a footnote is added to explain.

# One single-column data frame per table column: the pattern as typed in R
# (backslashes doubled again so they survive rendering), the collected
# matches, and a plain-language description.
ptrn <- data.frame(pattern = c("[0-9]+\\\\$", "\\\\b[a-z]{1,4}\\\\b", ".*?\\\\.txt$", "\\\\d{2}/\\\\d{2}/\\\\d{4}", "<(.+?)>.+?</\\\\1>"))
ex <- data.frame(example = c(answer_a_str, answer_b_str, answer_c_str, answer_d_str, answer_e_str))
desc <- data.frame(description = c("numbers with dollar signs", "words of length 1 to 4", "file names ending in .txt", "dates in format ##/##/####", "text wrapped in html tags"))

answ_ex <- cbind(ptrn, ex, desc)
# Scrub the example column: first remove "character(0)", "c(", ")" and double
# quotes, then remove the commas that separated the matches.
answ_ex$example <- gsub("character[(]0[)]|c[(]|[)]|[\"]", "", answ_ex$example)
answ_ex$example <- gsub("\\,", "", answ_ex$example)

library(kableExtra)
# Render the answer table as striped HTML that does not span the full page
# width. FALSE is spelled out because the shorthand F is an ordinary variable
# that can be reassigned.
anstable <- knitr::kable(answ_ex, "html", escape = TRUE) %>%
  kable_styling("striped", full_width = FALSE)
# Cap the width of the example and description columns.
anstable <- column_spec(anstable, 2:3, width_max = "40em")
# Print the table with a footnote explaining the backslashes shown in it.
add_footnote(anstable, label = "Backslashes in the patterns above are actually doubled")

pattern	example	description
[0-9]+\$	1000$ 10$ 45$	numbers with dollar signs
\b[a-z]{1,4}\b	now is the time for all good a four four twos ones	words of length 1 to 4
.*?\.txt$	file.name.txt code_2_3_2019.txt	file names ending in .txt
\d{2}/\d{2}/\d{4}	11/12/2019	dates in format ##/##/####
<(.+?)>.+?</\1>	<div>paragraph or graphic</div> <p>paragraph</p> <div>The cat jumped over the moon. The dog quickly followed. The mouse just chilled.</div>	text wrapped in html tags
^a Backslashes in the patterns above are actually doubled

The following code hides a secret message. Crack it with R and regular expressions. Hint: Some of the characters are more revealing than others! The code snippet is also available in the materials at www.r-datacollection.com.

# Scrambled text; the literal spans four lines, so it contains embedded
# newline characters.
string <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0Tanwo
Uwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigO
d6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5
fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"
# Delete every lowercase letter, digit and newline; what remains (uppercase
# letters and punctuation) is the hidden message.
str_remove_all(string, "([a-z])|([0-9])|\n")

## [1] "CONGRATULATIONS.YOU.ARE.A.SUPERNERD!"

Week3_Assignment

Stephen Jones

February 12, 2019