#install.packages('stringr')

chap 8

Q3 Copy the introductory example. The vector `name` stores the extracted names.

R> name
[1] "Moe Szyslak"          "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
[4] "Ned Flanders"         "Simpson, Homer"       "Dr. Julius Hibbert"

library('stringr')
library('tidyr')
# Phone-book text in which phone numbers and names run together in one string.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"

# [:alpha:] matches alphabetic characters (a-z and A-Z); the class below also
# admits ".", "," and " ".
# {2,} requires the preceding item to repeat two or more times.
# str_extract_all() returns one list element per input string; raw.data is a
# single string, so the first element holds every extracted name.
name <- str_extract_all(string = raw.data, pattern = "[[:alpha:]., ]{2,}")[[1]]
name
## [1] "Moe Szyslak"          "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders"         "Simpson, Homer"       "Dr. Julius Hibbert"

(a) Use the tools of this chapter to rearrange the vector so that all elements conform to the standard first_name last_name.

# Split each name on blanks into a 4-column character matrix t; slots that a
# name does not fill come back as "".
t <- str_split_fixed(name, "[[:blank:]]", 4)
t
##      [,1]       [,2]       [,3]         [,4]
## [1,] "Moe"      "Szyslak"  ""           ""  
## [2,] "Burns,"   "C."       "Montgomery" ""  
## [3,] "Rev."     "Timothy"  "Lovejoy"    ""  
## [4,] "Ned"      "Flanders" ""           ""  
## [5,] "Simpson," "Homer"    ""           ""  
## [6,] "Dr."      "Julius"   "Hibbert"    ""
# Name the columns after the role each one holds once the rows below have been
# rearranged (right after the split they still contain the raw tokens).
colnames(t) <- c("firstname", "lastname", "secondname", "title")

# Rows written as "Last, First": the first token contains the comma, so swap
# the firstname and lastname values. The mask is computed once and reused on
# both sides of the assignment.
lastFirst <- grepl(",", t[, "firstname"], fixed = TRUE)
t[lastFirst, c("lastname", "firstname")] <-
  t[lastFirst, c("firstname", "lastname")]
t
##      firstname lastname   secondname   title
## [1,] "Moe"     "Szyslak"  ""           ""   
## [2,] "C."      "Burns,"   "Montgomery" ""   
## [3,] "Rev."    "Timothy"  "Lovejoy"    ""   
## [4,] "Ned"     "Flanders" ""           ""   
## [5,] "Homer"   "Simpson," ""           ""   
## [6,] "Dr."     "Julius"   "Hibbert"    ""
# Rows that start with a title: the first token has two or more characters
# followed by a literal "." (so the two-character initial "C." is not caught).
# Move that token to title, shift the remaining two tokens left into
# firstname/lastname, and let secondname take the old (empty) title value.
titled <- grepl(".{2,}\\.", t[, "firstname"])
t[titled, c("title", "lastname", "firstname", "secondname")] <-
  t[titled, c("firstname", "secondname", "lastname", "title")]
t
##      firstname lastname   secondname   title 
## [1,] "Moe"     "Szyslak"  ""           ""    
## [2,] "C."      "Burns,"   "Montgomery" ""    
## [3,] "Timothy" "Lovejoy"  ""           "Rev."
## [4,] "Ned"     "Flanders" ""           ""    
## [5,] "Homer"   "Simpson," ""           ""    
## [6,] "Julius"  "Hibbert"  ""           "Dr."
# Strip the comma left behind on the swapped last names.
t[, "lastname"] <- str_replace(t[, "lastname"], pattern = ",", replacement = "")
print(t)
##      firstname lastname   secondname   title 
## [1,] "Moe"     "Szyslak"  ""           ""    
## [2,] "C."      "Burns"    "Montgomery" ""    
## [3,] "Timothy" "Lovejoy"  ""           "Rev."
## [4,] "Ned"     "Flanders" ""           ""    
## [5,] "Homer"   "Simpson"  ""           ""    
## [6,] "Julius"  "Hibbert"  ""           "Dr."
# Assemble the vector the exercise asks for: first name, then the second name
# where one exists, then the last name. Titles are left out.
fullNameVec <- ifelse(
  t[, "secondname"] == "",
  paste(t[, "firstname"], t[, "lastname"]),
  paste(t[, "firstname"], t[, "secondname"], t[, "lastname"])
)
fullNameVec
## [1] "Moe Szyslak"         "C. Montgomery Burns" "Timothy Lovejoy"    
## [4] "Ned Flanders"        "Homer Simpson"       "Julius Hibbert"

(b) Construct a logical vector indicating whether a character has a title (i.e., Rev. and Dr.).

# Logical vector, one entry per row of t: TRUE where the title column is
# non-empty.
hasTitle <- t[, "title"] != ""
hasTitle
## [1] FALSE FALSE  TRUE FALSE FALSE  TRUE
# Rows of t with a non-empty title; drop = FALSE keeps the result a matrix
# even when only one row qualifies.
titleVec <- t[hasTitle, , drop = FALSE]
show(titleVec)
##      firstname lastname  secondname title 
## [1,] "Timothy" "Lovejoy" ""         "Rev."
## [2,] "Julius"  "Hibbert" ""         "Dr."

(c) Construct a logical vector indicating whether a character has a second name.

# Logical vector, one entry per row of t: TRUE where the secondname column is
# non-empty.
hasSecondName <- t[, "secondname"] != ""
hasSecondName
## [1] FALSE  TRUE FALSE FALSE FALSE FALSE
# Rows of t with a non-empty second name; drop = FALSE keeps the result a
# matrix even when only one row qualifies.
secondNameVec <- t[hasSecondName, , drop = FALSE]
show(secondNameVec)
##      firstname lastname secondname   title
## [1,] "C."      "Burns"  "Montgomery" ""

4. Describe the types of strings that conform to the following regular expressions and construct an example that is matched by the regular expression.

(a) [0-9]+\$

Ans: One or more digits immediately followed by a literal $ sign.

# str_extract() returns the first match: the first run of digits that is
# directly followed by "$".
test <- "m578$/search?fr\\445kk55$=mcaf"
str_extract(string = test, pattern = "[0-9]+\\$")
## [1] "578$"

(b) \b[a-z]{1,4}\b

Ans: A whole word made of one to four lowercase letters (a-z), with a word boundary on each side.

# A four-letter lowercase word: the string's start and end act as the two
# word boundaries.
test <- "abch"
str_extract(string = test, pattern = "\\b[a-z]{1,4}\\b")
## [1] "abch"

(c) .*?\.txt$

Ans: `.*?` lazily matches zero or more characters (as few as possible), and the string must end with the literal `.txt`, e.g. a text-file name.

# The pattern is anchored at the end with "$", so the whole string up to and
# including ".txt" is returned.
test <- ".....dsepple.txt"
str_extract(string = test, pattern = ".*?\\.txt$")
## [1] ".....dsepple.txt"

(d) \d{2}/\d{2}/\d{4}

Ans: Two digits, a slash, two digits, a slash, and four digits, e.g. a date written as dd/mm/yyyy or mm/dd/yyyy.

# A date-like string in the 2/2/4 digit layout the pattern describes.
test <- "01/10/2017"
str_extract(string = test, pattern = "\\d{2}/\\d{2}/\\d{4}")
## [1] "01/10/2017"

(e) <(.+?)>.+?</\1>

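Ans: An opening tag `<...>` whose name is one or more characters (matched lazily and captured by the group `(.+?)`), followed by at least one character of content (also matched lazily), followed by the closing tag `</...>`, where `\1` is a backreference that must repeat exactly the text captured inside the opening tag.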

# The closing tag repeats the captured tag name "html", so the backreference
# \\1 is satisfied and the whole element is returned.
test <- "<html>dr35</html>"
str_extract(string = test, pattern = "<(.+?)>.+?</\\1>")
## [1] "<html>dr35</html>"

9. The following code hides a secret message. Crack it with R and regular expressions.

Hint: Some of the characters are more revealing than others! The code snippet is also available in the materials at www.r-datacollection.com. clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0Tanwo Uwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigO d6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5 fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr

# Scrambled text; the string literal spans four lines, so it contains newlines.
raw.data <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0Tanwo
Uwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigO
d6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5
fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"

# The upper-case letters carry the message. The POSIX class is written in its
# bracketed form [[:upper:]], matching the [[:alpha:]...] usage earlier in
# this file.
s <- unlist(str_extract_all(raw.data, "[[:upper:]]"))
s
##  [1] "C" "O" "N" "G" "R" "A" "T" "U" "L" "A" "T" "I" "O" "N" "S" "Y" "O"
## [18] "U" "A" "R" "E" "A" "S" "U" "P" "E" "R" "N" "E" "R" "D"
# Also keep "." and "!": in this text the periods fall between the words and
# the "!" ends the message. Collapse the characters into one string and turn
# the periods into spaces to get a readable sentence.
messageChars <- unlist(str_extract_all(raw.data, "[[:upper:].!]"))
secretMessage <- gsub(".", " ", paste(messageChars, collapse = ""), fixed = TRUE)
secretMessage
## [1] "CONGRATULATIONS YOU ARE A SUPERNERD!"