Question 03

library(stringr)
# Textbook example: a phone-book string with names and numbers run together.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"
# A name is any run of at least 10 letters, periods, commas, or spaces.
# raw.data is a single string, so the first list element holds every match.
name <- str_extract_all(
  string = raw.data,
  pattern = "[[:alpha:]., ]{10,}"
)[[1]]
print(name)
## [1] "Moe Szyslak"          "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders"         "Simpson, Homer"       "Dr. Julius Hibbert"
# The code above is taken from the textbook.
a) Use the tools of this chapter to rearrange the vector so that all elements conform to the standard first_name last_name.

First, swap the two entries written as "last name, first name" (Burns, C. Montgomery and Simpson, Homer).

# Swap "last, first" entries: group 2 (the comma plus the first names) is
# moved in front of group 1 (the last name). Entries without a comma do not
# match and are left untouched. The comma travels along with group 2.
name <- str_replace_all(
  string = name,
  pattern = "(.+)(, .+)$",
  replacement = "\\2 \\1"
)
print(name)
## [1] "Moe Szyslak"           ", C. Montgomery Burns" "Rev. Timothy Lovejoy" 
## [4] "Ned Flanders"          ", Homer Simpson"       "Dr. Julius Hibbert"

Delete the commas in the text

# Strip every comma-plus-space sequence by replacing it with the empty string.
name <- str_replace_all(
  string = name,
  pattern = ", ",
  replacement = ""
)
print(name)
## [1] "Moe Szyslak"          "C. Montgomery Burns"  "Rev. Timothy Lovejoy"
## [4] "Ned Flanders"         "Homer Simpson"        "Dr. Julius Hibbert"

Remove the titles

# Drop titles such as "Rev." or "Dr.": one capital letter, one or two
# lowercase letters, and a period. The trailing `\\s*` also consumes the
# whitespace after the title, so no leading space is left behind.
# A single-letter initial such as "C." has no lowercase letter and is kept.
namefinal <- str_replace_all(
  string = name,
  pattern = "[A-Z][a-z]{1,2}\\.\\s*",
  replacement = ""
)
print(namefinal)
## [1] "Moe Szyslak"         "C. Montgomery Burns" "Timothy Lovejoy"
## [4] "Ned Flanders"        "Homer Simpson"       "Julius Hibbert"

b) Construct a logical vector indicating whether a character has a title (i.e., Rev. and Dr.).

library("knitr")
# Collect the names in a data frame so the flags can be shown side by side.
data <- data.frame(name)
# A title is two or more word characters directly followed by a period
# ("Rev.", "Dr."). The single-letter initial "C." is too short to qualify.
data[["title"]] <- str_detect(string = name, pattern = "\\w{2,}\\.")
print(data[["title"]])
## [1] FALSE FALSE  TRUE FALSE FALSE  TRUE
kable(data)
name title
Moe Szyslak FALSE
C. Montgomery Burns FALSE
Rev. Timothy Lovejoy TRUE
Ned Flanders FALSE
Homer Simpson FALSE
Dr. Julius Hibbert TRUE

c) Construct a logical vector indicating whether a character has a second name.

# A second name is signalled by an abbreviated initial: a single capital
# letter directly followed by a period (as in "C. Montgomery Burns").
# The titles do not match because the letter before their period is lowercase.
data[["Sname"]] <- str_detect(string = name, pattern = "[A-Z]{1}\\.")
print(data[["Sname"]])
## [1] FALSE  TRUE FALSE FALSE FALSE FALSE
kable(data)
name title Sname
Moe Szyslak FALSE FALSE
C. Montgomery Burns FALSE TRUE
Rev. Timothy Lovejoy TRUE FALSE
Ned Flanders FALSE FALSE
Homer Simpson FALSE FALSE
Dr. Julius Hibbert TRUE FALSE

Question 04

  1. [0-9]+\$
# The pattern matches one or more digits directly followed by a literal "$".
a <- c("alphanumeric456", "396$dhyth", "jhah8724627hav$", "real453$estate")
# Which test strings contain the pattern?
grepl(pattern = "[0-9]+\\$", x = a)
## [1] FALSE  TRUE FALSE  TRUE
# Pull out the matching substrings themselves.
unlist(str_extract_all(string = a, pattern = "[0-9]+\\$"))
## [1] "396$" "453$"
  1. \b[a-z]{1,4}\b
# The pattern matches a stand-alone word of one to four lowercase letters:
# `\b` on both sides requires a word boundary, so the letters may not be
# adjacent to other letters, digits, or underscores.
b <- c("Samantha", "john", "apple", "banana5", "gre", "gRe")
# Test exactly the pattern from the exercise. No trailing `$` is added, so
# the result agrees with the extraction below.
grepl(pattern = "\\b[a-z]{1,4}\\b", x = b)
## [1] FALSE  TRUE FALSE FALSE  TRUE FALSE
# Pull out the matching words themselves.
unlist(str_extract_all(string = b, pattern = "\\b[a-z]{1,4}\\b"))
## [1] "john" "gre"
  1. .*?\.txt$
# The pattern matches any string that ends in the literal suffix ".txt".
c <- c("fox.txt", "alpha.txtjghh", "alptxt", "1234.txt")
# Which test strings end in ".txt"?
grepl(pattern = ".*?\\.txt$", x = c)
## [1]  TRUE FALSE FALSE  TRUE
# Pull out the matching strings themselves.
unlist(str_extract_all(string = c, pattern = ".*?\\.txt$"))
## [1] "fox.txt"  "1234.txt"
  1. \d{2}/\d{2}/\d{4}
# The pattern matches dates written as two digits, a slash, two digits,
# a slash, and four digits (xx/xx/xxxx).
d <- c("03/06/2019", "06/05/19", "04/03/2003", "4/5/98")
# Which test strings contain such a date?
grepl(pattern = "\\d{2}/\\d{2}/\\d{4}", x = d)
## [1]  TRUE FALSE  TRUE FALSE
# Pull out the matching dates themselves.
unlist(str_extract_all(string = d, pattern = "\\d{2}/\\d{2}/\\d{4}"))
## [1] "03/06/2019" "04/03/2003"
  1. <(.+?)>.+?</\1>
# The pattern matches an opening tag, at least one character of content, and
# the closing tag with the same name (the backreference \1 repeats group 1).
# A single space counts as content, as in <girl> </girl>; an empty element
# such as <boy></boy> does not match.
e <- c("<html>between</html>", "<boy>girl</boy>", "<boy></boy>", "<girl> </girl>")
# Which test strings contain a non-empty tag pair?
grepl(pattern = "<(.+?)>.+?</\\1>", x = e)
## [1]  TRUE  TRUE FALSE  TRUE
# Pull out the matching elements themselves.
unlist(str_extract_all(string = e, pattern = "<(.+?)>.+?</\\1>"))
## [1] "<html>between</html>" "<boy>girl</boy>"      "<girl> </girl>"

Question 09

# Scrambled text for the exercise; the hidden message is decoded step by step.
mystery <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"

It looks like the message is in caps, so I am removing the lowercase letters.

# Step 1: delete every lowercase letter, leaving capitals, digits, and
# punctuation.
mystery1 <- gsub(
  pattern = "[[:lower:]]",
  replacement = "",
  x = mystery
)
print(mystery1)
## [1] "C10877ON92G8R5A50TU7L803AT5I307O553N364S.11YO6U2.2A4R905E.A.SU65P17E2463R95896N594E9054R5D!"

Next I remove all the digits.

# Step 2: delete every digit, leaving only the capitals and punctuation.
mystery2 <- gsub(
  pattern = "[[:digit:]]",
  replacement = "",
  x = mystery1
)
print(mystery2)
## [1] "CONGRATULATIONS.YOU.ARE.A.SUPERNERD!"

Now I have to replace the “.” between the words with spaces.

# Step 3: turn each literal period into a space so the words are separated,
# then print the message without surrounding quotes.
mystery_solved <- str_replace_all(
  string = mystery2,
  pattern = "\\.",
  replacement = " "
)
print(mystery_solved, quote = FALSE)
## [1] CONGRATULATIONS YOU ARE A SUPERNERD!