library(stringr)
# Names interleaved with phone-number-like digit groups, all in one string.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"
# Keep every run of at least ten letters, periods, commas or spaces.
# raw.data is a single string, so the first list element holds all matches.
name <- str_extract_all(raw.data, "[[:alpha:]., ]{10,}")[[1]]
print(name)
## [1] "Moe Szyslak" "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" "Simpson, Homer" "Dr. Julius Hibbert"
# The code above is taken from the textbook.
First, reorder the two reversed names (Burns, C. Montgomery and Simpson, Homer) so that the surname comes last.
# Swap the text before the comma with the ", rest" part that follows it.
# Names without a comma do not match and stay unchanged. The swapped names
# still begin with the ", " that is captured as part of group 2.
name <- str_replace_all(
  string = name,
  pattern = "(.+)(, .+)$",
  replacement = "\\2 \\1"
)
print(name)
## [1] "Moe Szyslak" ", C. Montgomery Burns" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" ", Homer Simpson" "Dr. Julius Hibbert"
Delete the leftover commas, together with the space that follows each one.
# Drop every comma-plus-space sequence from the names.
name <- str_remove_all(name, ", ")
print(name)
## [1] "Moe Szyslak" "C. Montgomery Burns" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" "Homer Simpson" "Dr. Julius Hibbert"
Remove the titles (Rev. and Dr.).
# Strip titles: a capital letter, one or two lowercase letters and a period
# (e.g. "Dr.", "Rev."). The trailing \\s* also consumes the whitespace after
# the title, so the cleaned names no longer start with a stray space.
# Single-letter initials such as "C." are kept because at least one lowercase
# letter is required before the period.
namefinal <- str_replace_all(name, "[A-Z][a-z]{1,2}\\.\\s*", "")
namefinal
## [1] "Moe Szyslak" "C. Montgomery Burns" "Timothy Lovejoy"
## [4] "Ned Flanders" "Homer Simpson" "Julius Hibbert"
b) Construct a logical vector indicating whether a character has a title (i.e., Rev. or Dr.).
library("knitr")
# One row per name. The title flag is TRUE when the name contains two or more
# word characters immediately followed by a period (e.g. "Rev.", "Dr.").
data <- data.frame(name = name)
data[["title"]] <- str_detect(name, "\\w{2,}\\.")
data[["title"]]
## [1] FALSE FALSE TRUE FALSE FALSE TRUE
kable(data)
| name | title |
|---|---|
| Moe Szyslak | FALSE |
| C. Montgomery Burns | FALSE |
| Rev. Timothy Lovejoy | TRUE |
| Ned Flanders | FALSE |
| Homer Simpson | FALSE |
| Dr. Julius Hibbert | TRUE |
c) Construct a logical vector indicating whether a character has a second name.
# Flag names that contain a single capital letter directly followed by a
# period, i.e. an abbreviated initial such as "C.".
data[["Sname"]] <- str_detect(name, "[A-Z]{1}\\.")
data[["Sname"]]
## [1] FALSE TRUE FALSE FALSE FALSE FALSE
kable(data)
| name | title | Sname |
|---|---|---|
| Moe Szyslak | FALSE | FALSE |
| C. Montgomery Burns | FALSE | TRUE |
| Rev. Timothy Lovejoy | TRUE | FALSE |
| Ned Flanders | FALSE | FALSE |
| Homer Simpson | FALSE | FALSE |
| Dr. Julius Hibbert | TRUE | FALSE |
a <- c("alphanumeric456", "396$dhyth", "jhah8724627hav$", "real453$estate")
# Which elements contain one or more digits immediately followed by "$"?
grepl(pattern = "[0-9]+\\$", x = a)
## [1] FALSE TRUE FALSE TRUE
# Pull out the digits-plus-dollar-sign pieces that matched.
unlist(str_extract_all(string = a, pattern = "[0-9]+\\$"))
## [1] "396$" "453$"
b <- c("Samantha", "john", "apple", "banana5", "gre", "gRe")
# Detection and extraction use the same pattern. A trailing "$" in only one
# of the two calls would make them disagree on input with several words.
grepl("\\b[a-z]{1,4}\\b", b)
## [1] FALSE TRUE FALSE FALSE TRUE FALSE
# The pattern matches a whole word made of one to four lowercase letters.
unlist(str_extract_all(b, pattern = "\\b[a-z]{1,4}\\b"))
## [1] "john" "gre"
c <- c("fox.txt", "alpha.txtjghh", "alptxt", "1234.txt")
# Which elements end in the literal suffix ".txt"?
grepl(pattern = ".*?\\.txt$", x = c)
## [1] TRUE FALSE FALSE TRUE
# Extract the full match for every string that ends in ".txt".
unlist(str_extract_all(string = c, pattern = ".*?\\.txt$"))
## [1] "fox.txt" "1234.txt"
d <- c("03/06/2019", "06/05/19", "04/03/2003", "4/5/98")
# Which elements contain two digits, "/", two digits, "/", four digits?
grepl(pattern = "\\d{2}/\\d{2}/\\d{4}", x = d)
## [1] TRUE FALSE TRUE FALSE
# Extract the date-like pieces in the form xx/xx/xxxx.
unlist(str_extract_all(string = d, pattern = "\\d{2}/\\d{2}/\\d{4}"))
## [1] "03/06/2019" "04/03/2003"
e <- c("<html>between</html>", "<boy>girl</boy>", "<boy></boy>", "<girl> </girl>")
# Which elements hold an opening tag, at least one character of content, and
# a closing tag whose name repeats the opening one (backreference \\1)?
grepl(pattern = "<(.+?)>.+?</\\1>", x = e)
## [1] TRUE TRUE FALSE TRUE
# A single space counts as content ("<girl> </girl>"); an empty element
# ("<boy></boy>") does not match.
unlist(str_extract_all(string = e, pattern = "<(.+?)>.+?</\\1>"))
## [1] "<html>between</html>" "<boy>girl</boy>" "<girl> </girl>"
# Scrambled input for the final exercise: a mix of lowercase letters, digits,
# uppercase letters, periods and an exclamation mark.
mystery <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"
It looks like the message is in caps, so I am removing the lowercase letters.
# Delete every lowercase letter; uppercase letters, digits and punctuation stay.
mystery1 <- gsub("[[:lower:]]", "", mystery)
print(mystery1)
## [1] "C10877ON92G8R5A50TU7L803AT5I307O553N364S.11YO6U2.2A4R905E.A.SU65P17E2463R95896N594E9054R5D!"
Next I remove all the digits.
# Delete every digit, leaving only the capitals and the punctuation marks.
mystery2 <- gsub("[[:digit:]]", "", mystery1)
print(mystery2)
## [1] "CONGRATULATIONS.YOU.ARE.A.SUPERNERD!"
Now I have to replace the “.” between the words with a space.
# Turn each literal period into a space so the words are separated.
mystery_solved <- str_replace_all(mystery2, fixed("."), " ")
print(mystery_solved, quote = FALSE)
## [1] CONGRATULATIONS YOU ARE A SUPERNERD!