# Reproduce the introductory example. The vector `name` stores the extracted names.
library(stringr)
# The raw string is split over three literals only for readability.
# paste0() is used on purpose: paste() would insert its default " " separator
# at every junction and produce doubled spaces ("C.  Montgomery",
# "Ned  Flanders") inside the extracted names.
raw.data <- paste0(
  "555-1239Moe Szyslak(636) 555-0113Burns, C. ",
  "Montgomery555 -6542Rev. Timothy Lovejoy555 8904Ned ",
  "Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"
)
# A name is any run of two or more letters, periods, commas or spaces.
name <- unlist(str_extract_all(raw.data, "[[:alpha:]., ]{2,}"))
name
## [1] "Moe Szyslak" "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" "Simpson, Homer" "Dr. Julius Hibbert"
# Some names contain an abbreviated name, such as "C." for Charles.
# Replace " C. " with a single space so the neighbouring words stay separated.
# [A-Za-z] is used instead of [A-z]: the A-z range also covers the ASCII
# punctuation that sits between "Z" and "a" ([ \ ] ^ _ `).
name2 <- sub(" [A-Za-z]\\. ", " ", name)
name2
## [1] "Moe Szyslak" "Burns, Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" "Simpson, Homer" "Dr. Julius Hibbert"
# Now remove titles: two or three letters followed by a period and a space
# (here "Rev. " and "Dr. ").
name3 <- sub("[A-Za-z]{2,3}\\. ", "", name2)
name3
## [1] "Moe Szyslak" "Burns, Montgomery" "Timothy Lovejoy"
## [4] "Ned Flanders" "Simpson, Homer" "Julius Hibbert"
# Names written as "Last, First" are exactly the ones containing a comma,
# so swap the two words around the comma; entries without a comma are
# left untouched by sub().
name4 <- sub("(\\w+),\\s(\\w+)", "\\2 \\1", name3)
name4
## [1] "Moe Szyslak" "Montgomery Burns" "Timothy Lovejoy"
## [4] "Ned Flanders" "Homer Simpson" "Julius Hibbert"
# Present the cleaned names as a one-column data frame (column "name4").
df.names <- data.frame(name4 = name4)
df.names
## name4
## 1 Moe Szyslak
## 2 Montgomery Burns
## 3 Timothy Lovejoy
## 4 Ned Flanders
## 5 Homer Simpson
## 6 Julius Hibbert
# Go back to 'name2' from part (a), which still carries the titles.
# A title is detected as two or more letters directly followed by a period.
title_vector <- str_detect(string = name2, pattern = "[[:alpha:]]{2,}\\.")
title_vector
## [1] FALSE FALSE TRUE FALSE FALSE TRUE
# Put each name next to its flag so the result is easier to read.
df.titlevector <- data.frame(name2 = name2, title_vector = title_vector)
df.titlevector
## name2 title_vector
## 1 Moe Szyslak FALSE
## 2 Burns, Montgomery FALSE
## 3 Rev. Timothy Lovejoy TRUE
## 4 Ned Flanders FALSE
## 5 Simpson, Homer FALSE
## 6 Dr. Julius Hibbert TRUE
# We have correctly identified who has a title, using TRUE/FALSE values.
# Flag names that contain a second (abbreviated) name: a single letter
# followed by a period, with a space on each side (e.g. " C. ").
# [A-Za-z] is used instead of [A-z], whose range also matches the ASCII
# punctuation between "Z" and "a" ([ \ ] ^ _ `).
secondname <- str_detect(name, " [A-Za-z]\\. ")
# Pair each original name with its flag.
df.secondname <- data.frame(name, secondname)
df.secondname
## name secondname
## 1 Moe Szyslak FALSE
## 2 Burns, C. Montgomery TRUE
## 3 Rev. Timothy Lovejoy FALSE
## 4 Ned Flanders FALSE
## 5 Simpson, Homer FALSE
## 6 Dr. Julius Hibbert FALSE
# We have correctly identified who has a second name.
# [0-9]+\\$
#\\b[a-z]{1,4}\\b
# .*?\\.txt$
#\\d{2}/\\d{2}/\\d{4}
#<(.+?)>.+?</\\1>
# pattern1: one or more digits immediately followed by a literal dollar sign.
pattern1 <- "[0-9]+\\$"
sample1 <- c("1837648", "hy45$", "55$")
# The first sample has digits but no "$", so it is the only non-match.
str_detect(string = sample1, pattern = pattern1)
## [1] FALSE TRUE TRUE
# pattern2: a whole word (word boundary on both sides) made of one to four
# lowercase letters.
pattern2 <- "\\b[a-z]{1,4}\\b"
sample2 <- c("cat", "dogs", "birds", "the", "abc is 123 is cool")
# "birds" has five letters, so it is the only sample without a match.
str_detect(string = sample2, pattern = pattern2)
## [1] TRUE TRUE FALSE TRUE TRUE
# pattern3: any (possibly empty) run of characters followed by ".txt" at the
# end of the string.
pattern3 <- ".*?\\.txt$"
sample3 <- c("hello.txt", "txt", "pets.txt", "hd73tdh.txt")
# "txt" lacks the literal dot, so it does not match.
str_detect(string = sample3, pattern = pattern3)
## [1] TRUE FALSE TRUE TRUE
# pattern4: digits written in the format nn/nn/nnnn
# (two digits, slash, two digits, slash, four digits), e.g. a date.
pattern4 <- "\\d{2}/\\d{2}/\\d{4}"
sample4 <- c("109/8473/9848", "33/33/3333", "5/4/3")
# Only the second sample has the 2/2/4 digit grouping.
str_detect(string = sample4, pattern = pattern4)
## [1] FALSE TRUE FALSE
# pattern5 (HTML-style tag pair):
#   - one or more characters inside angle brackets <>, captured as group 1,
#   - followed by one or more characters,
#   - followed by a closing tag: the same captured text inside </ >.
pattern5 <- "<(.+?)>.+?</\\1>"
sample5 <- c("<bob>hello</bob>", "<bob>hello<bob>")
# The second sample never closes the tag with "</bob>", so it does not match.
str_detect(string = sample5, pattern = pattern5)
## [1] TRUE FALSE
secret_code <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"
# The hidden message is spelled by the uppercase letters, with periods acting
# as word separators: extract every run of uppercase letters and periods.
# Note: the "!" near the end of the string is not part of this character
# class, so it is not captured.
secret_code2 <- str_extract_all(secret_code, "[[:upper:].]{1,}")[[1]]
secret_code2
## [1] "C" "O" "N" "G" "R" "A" "T" "U" "L" "AT" "I" "O" "N" "S"
## [15] "." "Y" "O" "U" "." "A" "R" "E" "." "A" ".S" "U" "P" "E"
## [29] "R" "N" "E" "R" "D"
# The message is already visible; glue the pieces together and turn the
# periods into spaces so it reads like ordinary text.
secret_code3 <- str_replace_all(
  paste(secret_code2, collapse = ""),
  "[.]",
  " "
)
secret_code3
## [1] "CONGRATULATIONS YOU ARE A SUPERNERD"