# Raw data: names and phone numbers run together in one string
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"
# Pull out every run of two or more letters, dots, commas and spaces.
# A single input string gives a one-element list, so its first element is
# the character vector of full names.
names <- str_extract_all(string = raw.data, pattern = "[[:alpha:]., ]{2,}")[[1]]
names
## [1] "Moe Szyslak" "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" "Simpson, Homer" "Dr. Julius Hibbert"
# Normalise every entry to "First Last".
# Step 1: entries written as "Last, First" are flipped to "First Last", so
# that the first word of every cleaned name really is the first name.
str_names_only <- str_replace(names, "^(.+),\\s*(.+)$", "\\2 \\1")
# Step 2: drop the titles "Dr." / "Rev.", the initial "C." and any leftover
# commas. The dots are escaped so they match a literal "." and not any
# character.
str_names_only <- str_replace_all(
  str_names_only,
  "Dr\\.\\s+|Rev\\.\\s+|C\\.\\s+|,",
  ""
)
# Split each cleaned name into its words: one list element per person
str_names_list <- str_split(str_names_only, "\\W+")
noquote(str_names_list)
## [[1]]
## [1] Moe Szyslak
##
## [[2]]
## [1] Montgomery Burns
##
## [[3]]
## [1] Timothy Lovejoy
##
## [[4]]
## [1] Ned Flanders
##
## [[5]]
## [1] Homer Simpson
##
## [[6]]
## [1] Julius Hibbert
# Flag which entries carry a title ("Dr." or "Rev.").
# The initial "C." and any commas are removed first; the dot is escaped so
# only a literal "C." is stripped.
str_names_titles <- str_replace_all(names, "C\\.\\s+|,", "")
# str_detect() is vectorised over its input, so no loop is needed. The
# escaped dots keep ordinary names that merely start with "Dr"/"Rev" from
# being reported as titles.
logical_vector_title <- str_detect(str_names_titles, "Dr\\.|Rev\\.")
logical_vector_title
## [1] FALSE FALSE TRUE FALSE FALSE TRUE
# Flag which entries contain a second name, written as an initial ("C.").
# Titles and commas are removed first so that the only dot left in an entry
# belongs to an initial. The dots in the title patterns are escaped;
# unescaped, "Dr." would also delete letters from names such as "Drew".
str_names_second <- str_replace_all(names, "Dr\\.|Rev\\.|,", "")
# str_detect() is vectorised over its input, so no loop is needed
logical_vector_secondName <- str_detect(str_names_second, "\\.")
logical_vector_secondName
## [1] FALSE TRUE FALSE FALSE FALSE FALSE
# Normalise every entry to "First Last" before splitting.
# Step 1: entries written as "Last, First" are flipped to "First Last";
# without this, "Burns" and "Simpson" would be reported as first names.
str_names_only <- str_replace(names, "^(.+),\\s*(.+)$", "\\2 \\1")
# Step 2: drop the titles "Dr." / "Rev.", the initial "C." and any leftover
# commas. The dots are escaped so they match a literal "." and not any
# character.
str_names_only <- str_replace_all(
  str_names_only,
  "Dr\\.\\s+|Rev\\.\\s+|C\\.\\s+|,",
  ""
)
# Split each cleaned name into its words: one list element per person
str_names_list <- str_split(str_names_only, "\\W+")
str_names_list
## [[1]]
## [1] "Moe" "Szyslak"
##
## [[2]]
## [1] "Montgomery" "Burns"
##
## [[3]]
## [1] "Timothy" "Lovejoy"
##
## [[4]]
## [1] "Ned" "Flanders"
##
## [[5]]
## [1] "Homer" "Simpson"
##
## [[6]]
## [1] "Julius" "Hibbert"
# The first word of each entry is the first name and the final word is the
# last name. vapply() builds each result in one pass and always returns a
# character vector, even when there are no names at all.
first_names_vector <- vapply(
  str_names_list,
  function(parts) parts[1],
  character(1)
)
last_names_vector <- vapply(
  str_names_list,
  function(parts) parts[length(parts)],
  character(1)
)
first_names_vector
## [1] "Moe" "Montgomery" "Timothy" "Ned" "Homer"
## [6] "Julius"
last_names_vector
## [1] "Szyslak" "Burns" "Lovejoy" "Flanders" "Simpson" "Hibbert"
Below regular expression will match numbers ending with a $ sign
# Example sentence containing a price and a date
sample_string <- "220$ is the price of this jacket bought on date 10/12/2018"
# One or more digits followed by an escaped (literal) dollar sign
str_extract_all(string = sample_string, pattern = "[0-9]+\\$")
## [[1]]
## [1] "220$"
Below regular expression will match words starting with small alphabets with a count of letters b/w 1 to 4 ending with a $ sign
# Example sentence containing a price and a date
sample_string <- "220$ is the price of this jacket bought on date 10/12/2018."
# Runs of 1 to 4 lowercase letters with a word boundary on each side
str_extract_all(string = sample_string, pattern = "\\b[a-z]{1,4}\\b")
## [[1]]
## [1] "is" "the" "of" "this" "on" "date"
# The pattern lazily matches any run of characters (possibly empty) followed
# by a literal ".txt" at the very end of the string
sample_text_names <- c("filename", "file.txt", "file.xml")
str_extract_all(string = sample_text_names, pattern = ".*?\\.txt$")
## [[1]]
## character(0)
##
## [[2]]
## [1] "file.txt"
##
## [[3]]
## character(0)
Below regular expression will match numbers in the format of dd/dd/dddd usually a date format
# Example sentence containing a price and a date
sample_string <- "220$ is the price of this jacket bought on date 10/12/2018"
# Two digits, "/", two digits, "/", four digits. Word boundaries replace the
# hard-coded leading space, so the match holds only the date itself and a
# date at the very start of the string is found as well.
str_extract_all(sample_string, "\\b\\d{2}/\\d{2}/\\d{4}\\b")
## [[1]]
## [1] "10/12/2018"
Below Regular expression will match a literal ‘<’ followed by any chars until > having an optional chars before matching ‘</’ followed a match which is a back reference to first match. Usualy used to extract xml tags and values
# Example sentence with one XML element embedded in it
sample_string <- "This text having <sometag>value</sometag> is a sample xml tag"
# Opening tag (name captured in group 1), lazily matched content, then a
# closing tag that must repeat the captured name
str_extract_all(string = sample_string, pattern = "<(.+?)>.+?</\\1>")
## [[1]]
## [1] "<sometag>value</sometag>"
When tested using upper predefined character class following message reveals ‘CONGRATULATIONSYOURARESUPERNERD’
# Scrambled text whose uppercase letters spell out the hidden message
encrypted <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"
# Keep each uppercase letter as its own element. A single input string gives
# a one-element list, so its first element is the vector of letters.
unencrypted <- str_extract_all(string = encrypted, pattern = "[:upper:]")[[1]]
unencrypted
## [1] "C" "O" "N" "G" "R" "A" "T" "U" "L" "A" "T" "I" "O" "N" "S" "Y" "O"
## [18] "U" "A" "R" "E" "A" "S" "U" "P" "E" "R" "N" "E" "R" "D"