Create Raw data

# Raw data: a single string in which names and phone numbers run together
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"

# A name is any run of two or more letters, periods, commas or spaces.
# raw.data is one string, so the first (and only) element of the result list
# already holds every match as a character vector.
names <- str_extract_all(raw.data, "[[:alpha:]., ]{2,}")[[1]]
print(names)
## [1] "Moe Szyslak"          "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders"         "Simpson, Homer"       "Dr. Julius Hibbert"

Questions

1. First and Last name

# Put "Last, First" entries into "First Last" order so that every element
# reads first name first; entries without a comma are left untouched.
str_names_only <- str_replace(names, "^([^,]+),\\s*(.+)$", "\\2 \\1")

# Drop the titles (Dr., Rev.) and the initial (C.). The periods are escaped so
# they match a literal "." instead of any character.
str_names_only <- str_replace_all(str_names_only, "\\b(Dr|Rev|C)\\.\\s+", "")

# Split each full name on whitespace into c(first, last)
str_names_list <- str_split(str_names_only, "\\s+")
noquote(str_names_list)
## [[1]]
## [1] Moe     Szyslak
## 
## [[2]]
## [1] Montgomery Burns
## 
## [[3]]
## [1] Timothy Lovejoy
## 
## [[4]]
## [1] Ned      Flanders
## 
## [[5]]
## [1] Homer   Simpson
## 
## [[6]]
## [1] Julius  Hibbert

2. Has a title (i.e., Rev. and Dr.)

# A name has a title when it contains "Dr." or "Rev." as a whole word.
# str_detect() is vectorised over `names`, so no splitting or loop is needed.
# The period is escaped to match a literal "."; left unescaped it would also
# flag names such as "Drew".
logical_vector_title <- str_detect(names, "\\b(Dr|Rev)\\.")

logical_vector_title
## [1] FALSE FALSE  TRUE FALSE FALSE  TRUE

3. Has a second name

# Strip the titles (with an escaped, literal period) and the commas, so that
# any period still left belongs to an abbreviated second name such as "C."
str_names_second <- str_replace_all(names, "\\b(Dr|Rev)\\.|,", "")

# str_detect() is vectorised: one TRUE/FALSE per name, no loop required
logical_vector_secondName <- str_detect(str_names_second, "\\.")

logical_vector_secondName
## [1] FALSE  TRUE FALSE FALSE FALSE FALSE

4. First and Last name vectors (Extra)

# Put "Last, First" entries into "First Last" order; without this step the
# surnames "Burns" and "Simpson" would land in the first-name position.
str_names_only <- str_replace(names, "^([^,]+),\\s*(.+)$", "\\2 \\1")

# Drop the titles (Dr., Rev.) and the initial (C.); the periods are escaped so
# they match a literal "." instead of any character.
str_names_only <- str_replace_all(str_names_only, "\\b(Dr|Rev|C)\\.\\s+", "")

# Split each full name on whitespace into its parts
str_names_list <- str_split(str_names_only, "\\s+")
str_names_list
## [[1]]
## [1] "Moe"     "Szyslak"
## 
## [[2]]
## [1] "Montgomery" "Burns"
## 
## [[3]]
## [1] "Timothy" "Lovejoy"
## 
## [[4]]
## [1] "Ned"      "Flanders"
## 
## [[5]]
## [1] "Homer"   "Simpson"
## 
## [[6]]
## [1] "Julius"  "Hibbert"
# Take the first part of every name as the first name and the final part as
# the last name. vapply() returns a character vector of the right length
# directly instead of growing one element at a time inside a loop.
first_names_vector <- vapply(
  str_names_list,
  function(name_parts) name_parts[1],
  character(1)
)
last_names_vector <- vapply(
  str_names_list,
  function(name_parts) name_parts[length(name_parts)],
  character(1)
)
first_names_vector
## [1] "Moe"        "Montgomery" "Timothy"    "Ned"        "Homer"
## [6] "Julius"
last_names_vector
## [1] "Szyslak"  "Burns"    "Lovejoy"  "Flanders" "Simpson"  "Hibbert"

Describe the types of strings that conform to the following regular expressions and construct an example that is matched by the regular expression.

1. [0-9]+\$

The regular expression below matches one or more digits immediately followed by a literal $ sign.

# Only "220" is directly followed by "$", so it is the single match; the
# digits of the date are not followed by "$" and are skipped.
sample_string <- "220$ is the price of this jacket bought on date 10/12/2018"
str_extract_all(string = sample_string, pattern = "[0-9]+\\$")
## [[1]]
## [1] "220$"

2 \b[a-z]{1,4}\b

The regular expression below matches whole words made up of one to four lowercase letters.

# The word boundaries (\b) on both sides mean that only complete words of one
# to four lowercase letters are returned; longer words such as "price" are not.
sample_string <- "220$ is the price of this jacket bought on date 10/12/2018."
str_extract_all(string = sample_string, pattern = "\\b[a-z]{1,4}\\b")
## [[1]]
## [1] "is"   "the"  "of"   "this" "on"   "date"

3 .*?\.txt$

# The pattern matches any string (possibly with nothing before the extension)
# that ends in a literal ".txt". One list element is returned per input, and
# inputs without a match yield character(0).
sample_text_names <- c('filename', 'file.txt', 'file.xml')
str_extract_all(string = sample_text_names, pattern = ".*?\\.txt$")
## [[1]]
## character(0)
## 
## [[2]]
## [1] "file.txt"
## 
## [[3]]
## character(0)

4 \d{2}/\d{2}/\d{4}

Below regular expression will match numbers in the format of dd/dd/dddd usually a date format

# Two digits, "/", two digits, "/", four digits. The pattern is exactly the
# expression under discussion, with no leading space, so the extracted date
# does not carry a stray blank in front of it.
sample_string <- "220$ is the price of this jacket bought on date 10/12/2018"
str_extract_all(sample_string, "\\d{2}/\\d{2}/\\d{4}")
## [[1]]
## [1] "10/12/2018"

5 <(.+?)>.+?</\1>

The regular expression below matches a literal ‘<’, then one or more characters (captured as group 1) up to the first ‘>’, then one or more characters of content, and finally ‘</’ followed by a backreference to group 1 and a closing ‘>’. It is usually used to extract XML tags together with their values.

# \1 refers back to the tag name captured by (.+?), so the closing tag has to
# repeat the opening tag; the whole element including its value is returned.
sample_string <- "This text having <sometag>value</sometag> is a sample xml tag"
str_extract_all(string = sample_string, pattern = "<(.+?)>.+?</\\1>")
## [[1]]
## [1] "<sometag>value</sometag>"

Decrypting String

Extracting every uppercase letter with the predefined upper character class reveals the message ‘CONGRATULATIONSYOUAREASUPERNERD’.

# The hidden message is spelled by the uppercase letters of the string
encrypted <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"

# Keep only the uppercase letters. The POSIX class is written inside a
# bracket expression, [[:upper:]], the same form used for [[:alpha:]] when the
# names were extracted.
unencrypted <- unlist(str_extract_all(encrypted, "[[:upper:]]"))
unencrypted
##  [1] "C" "O" "N" "G" "R" "A" "T" "U" "L" "A" "T" "I" "O" "N" "S" "Y" "O"
## [18] "U" "A" "R" "E" "A" "S" "U" "P" "E" "R" "N" "E" "R" "D"