Create data

# Raw phone-book text: each record is a phone number immediately followed by
# a name, with no separator between records. Assembled one record per line.
raw.data <- paste0(
  "555-1239Moe Szyslak",
  "(636) 555-0113Burns, C. Montgomery",
  "555-6542Rev. Timothy Lovejoy",
  "555 8904Ned Flanders",
  "636-555-3226Simpson, Homer",
  "5553642Dr. Julius Hibbert"
)

1. Conform to standard first_name last_name

Extract full name

# A name is a run of two or more letters, periods, commas, or spaces. Digits,
# hyphens, and parentheses are outside the class, and the {2,} minimum keeps
# the lone spaces inside phone numbers from matching on their own.
name <- unlist(
  str_extract_all(raw.data, "[[:alpha:]., ]{2,}")
)
print(name)
## [1] "Moe Szyslak"          "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders"         "Simpson, Homer"       "Dr. Julius Hibbert"

Remove Middle Name and title.

# Strip a leading title ("Rev. ", "Dr. ") or a middle initial ("C. "): one to
# three letters followed by a period and a space.
# - [[:alpha:]] replaces [A-z], whose range also covers the punctuation
#   characters [ \ ] ^ _ ` that sit between "Z" and "a".
# - The leading \b keeps the pattern from eating the tail of a longer word.
# - str_replace_all() removes every occurrence, so a name carrying both a
#   title and an initial is fully cleaned.
name_1 <- str_replace_all(name, "\\b[[:alpha:]]{1,3}\\. ", "")
name_1
## [1] "Moe Szyslak"       "Burns, Montgomery" "Timothy Lovejoy"  
## [4] "Ned Flanders"      "Simpson, Homer"    "Julius Hibbert"

Swap the first and last name when the name contains a comma, and remove the comma.

# "Last, First" -> "First Last": capture the word on each side of the comma
# and emit the two captures in swapped order, which also drops the comma.
# Names without a comma do not match and pass through unchanged.
name_2 <- str_replace(
  name_1,
  "(\\w+),\\s(\\w+)",
  "\\2 \\1"
)
print(name_2)
## [1] "Moe Szyslak"      "Montgomery Burns" "Timothy Lovejoy" 
## [4] "Ned Flanders"     "Homer Simpson"    "Julius Hibbert"

Extract First name and last name

# Split each cleaned name on spaces and keep its first two words.
# vapply() with a character(2) template guarantees a 2-row character matrix
# (one column per name); indexing with [1:2] pads a one-word name with NA
# instead of producing the ragged list that sapply() would return.
name_parts <- vapply(
  strsplit(name_2, " "),
  function(parts) parts[1:2],
  character(2)
)
first_last_name <- data.frame(
  first = name_parts[1, ],
  last = name_parts[2, ]
)
first_last_name
##        first     last
## 1        Moe  Szyslak
## 2 Montgomery    Burns
## 3    Timothy  Lovejoy
## 4        Ned Flanders
## 5      Homer  Simpson
## 6     Julius  Hibbert

2. Whether a character has a title

name
## [1] "Moe Szyslak"          "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders"         "Simpson, Homer"       "Dr. Julius Hibbert"
# A title is one to three letters at the very start of the name, followed by
# a period and a space. [[:alpha:]] replaces [A-z], whose range also covers
# the punctuation characters [ \ ] ^ _ ` between "Z" and "a".
title <- str_detect(name, "^[[:alpha:]]{1,3}\\. ")
# cbind() on a character and a logical vector yields a character matrix, so
# the flags print as "TRUE"/"FALSE" strings next to each name.
cbind(name, title)
##      name                   title  
## [1,] "Moe Szyslak"          "FALSE"
## [2,] "Burns, C. Montgomery" "FALSE"
## [3,] "Rev. Timothy Lovejoy" "TRUE" 
## [4,] "Ned Flanders"         "FALSE"
## [5,] "Simpson, Homer"       "FALSE"
## [6,] "Dr. Julius Hibbert"   "TRUE"

3. Whether a character has a second name

# A second (middle) name appears as a single-letter initial: whitespace, one
# letter, a period, and a space. [[:alpha:]] replaces [A-z], whose range also
# covers the punctuation characters [ \ ] ^ _ ` between "Z" and "a".
sec_name <- str_detect(name, "\\s[[:alpha:]]\\. ")
# cbind() coerces the logical flags to "TRUE"/"FALSE" strings for display.
cbind(name, sec_name)
##      name                   sec_name
## [1,] "Moe Szyslak"          "FALSE" 
## [2,] "Burns, C. Montgomery" "TRUE"  
## [3,] "Rev. Timothy Lovejoy" "FALSE" 
## [4,] "Ned Flanders"         "FALSE" 
## [5,] "Simpson, Homer"       "FALSE" 
## [6,] "Dr. Julius Hibbert"   "FALSE"

4. Describe the types of strings that conform to the following regular expressions and construct an example that is matched by the regular expression

4.1 [0-9]+\$ : one or more digits followed by a literal $ sign

# Pattern: one or more digits immediately followed by a literal dollar sign.
regexp_4_1 <- "[0-9]+\\$"
# Only the digit run that touches the "$" matches; the digits after the "$"
# are not themselves followed by one.
test_4_1 <- "abs1111$0110100101 "
print(str_extract_all(test_4_1, regexp_4_1))
## [[1]]
## [1] "1111$"

4.2 \b[a-z]{1,4}\b : a word boundary, then one to four lowercase letters (a to z), then another word boundary, i.e. a whole lowercase word of at most four letters

# Pattern: a whole word made of one to four lowercase letters.
regexp_4_2 <- "\\b[a-z]{1,4}\\b"
# "cadfc" has five letters and is skipped; "abw" has three and matches.
test_4_2 <- "cadfc abw"
print(str_extract_all(test_4_2, regexp_4_2))
## [[1]]
## [1] "abw"

4.3 .*?\.txt$ : . matches any character and *? repeats it zero or more times (as few as possible), \. is a literal period, followed by txt at the end of the string, i.e. any string that ends in .txt

# Pattern: any characters (matched lazily), then a literal ".txt" at the end
# of the string.
regexp_4_3 <- ".*?\\.txt$"
# The backslash is doubled so the example is the path .\test.txt; with a
# single backslash, "\t" is read as a tab escape and the string holds a tab
# instead of a backslash followed by "t".
test_4_3 <- ".\\test.txt"
str_extract_all(test_4_3, regexp_4_3)
## [[1]]
## [1] ".\\test.txt"

4.4 \d{2}/\d{2}/\d{4} : 2 digits / 2 digits / 4 digits

# Pattern: two digits, a slash, two digits, a slash, four digits
# (a date-like token such as dd/mm/yyyy).
regexp_4_4 <- "\\d{2}/\\d{2}/\\d{4}"
test_4_4 <- "11/11/1111"
print(str_extract_all(test_4_4, regexp_4_4))
## [[1]]
## [1] "11/11/1111"

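4.5 <(.+?)>.+?</\1> : an opening tag, one or more characters of content, and a closing tag whose name repeats the text captured by group 1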

# Plain-language reading of the pattern defined below.
print("<any character> any character </contents of group 1>")
## [1] "<any character> any character </contents of group 1>"
# \1 is a back-reference: the closing tag must repeat whatever the first
# capture group matched inside the opening tag.
regexp_4_5 <- "<(.+?)>.+?</\\1>"
test_4_5 <- "<span>20 flips come up heads</span>"
print(str_extract_all(test_4_5, regexp_4_5))
## [[1]]
## [1] "<span>20 flips come up heads</span>"

Congratulations you are a super nerd

# Scrambled text whose uppercase letters spell out a hidden message.
str_9 <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0Tanwo Uwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigO d6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5 fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"
# Each match is a maximal run of consecutive capitals, so adjacent uppercase
# letters come back together as one element.
regexp_9 <- "[A-Z]+"
print(str_extract_all(str_9, regexp_9))
## [[1]]
##  [1] "C"  "O"  "N"  "G"  "R"  "A"  "T"  "U"  "L"  "AT" "I"  "O"  "N"  "S" 
## [15] "Y"  "O"  "U"  "A"  "R"  "E"  "A"  "S"  "U"  "P"  "E"  "R"  "N"  "E" 
## [29] "R"  "D"