required Library

library(stringr)

3. Copy the introductory example. The vector name stores the extracted names.

Load Data

# Raw phone-book text: names and phone numbers are run together.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"
# A name is any run of two or more letters, periods, commas, or spaces.
# str_extract_all() returns one list element per input string; raw.data is a
# single string, so the first element holds every match.
name <- str_extract_all(
  string = raw.data,
  pattern = "[[:alpha:]., ]{2,}"
)[[1]]
name
## [1] "Moe Szyslak"          "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders"         "Simpson, Homer"       "Dr. Julius Hibbert"

(a) Use the tools of this chapter to rearrange the vector so that all elements conform to the standard "first_name last_name".

Remove Title

# Delete the first run of two or more letters that is immediately followed by
# a period (e.g. "Rev." or "Dr."). The space after it is left in place.
noTitle <- str_replace(
  string = name,
  pattern = "[[:alpha:]]{2,}[.]",
  replacement = ""
)
noTitle
## [1] "Moe Szyslak"          "Burns, C. Montgomery" " Timothy Lovejoy"    
## [4] "Ned Flanders"         "Simpson, Homer"       " Julius Hibbert"

Remove Second Name

# Delete the first single letter that is immediately followed by a period
# (an initial such as "C."). Surrounding spaces are left in place.
noSecName <- str_replace(
  string = noTitle,
  pattern = "[[:alpha:]]{1}[.]",
  replacement = ""
)
noSecName
## [1] "Moe Szyslak"        "Burns,  Montgomery" " Timothy Lovejoy"  
## [4] "Ned Flanders"       "Simpson, Homer"     " Julius Hibbert"

Extracting First Name

# Inner call: take either a word of two or more letters followed by whitespace
# ("First Last" form) or punctuation + whitespace + a word ("Last, First" form).
# Outer call: strip the first punctuation-plus-whitespace pair from that match.
# Leading or trailing spaces that remain are not removed here.
firstName <- str_replace(
  string = str_extract(
    string = noSecName,
    pattern = "[[:alpha:]]{2,}[[:space:]]{1,}|[[:punct:]][[:space:]]{1,}[[:alpha:]]{2,}"
  ),
  pattern = "[[:punct:]][[:space:]]",
  replacement = ""
)
firstName
## [1] "Moe "        " Montgomery" "Timothy "    "Ned "        "Homer"      
## [6] "Julius "

Extracting Last Name

# Innermost call: take either a non-punctuation character, one whitespace
# character and a word of two or more letters ("First Last" form), or a word of
# two or more letters followed by punctuation ("Last, First" form).
# Middle call: drop the first letter-plus-whitespace pair from that match.
# Outer call: drop the first punctuation character.
lastName <- str_replace(
  string = str_replace(
    string = str_extract(
      string = noSecName,
      pattern = "[^[:punct:]][[:space:]][[:alpha:]]{2,}|[[:alpha:]]{2,}[[:punct:]]"
    ),
    pattern = "[[:alpha:]][[:space:]]",
    replacement = ""
  ),
  pattern = "[[:punct:]]",
  replacement = ""
)
lastName
## [1] "Szyslak"  "Burns"    "Lovejoy"  "Flanders" "Simpson"  "Hibbert"

For first names and last names I was having trouble with the mix of letters and punctuation, so I used the str_replace function to clean up the data.

Rearrange

# firstName can carry leading or trailing whitespace left over from the
# extraction step. Trim both parts before combining them so every element has
# exactly one space between first and last name.
paste(str_trim(firstName), str_trim(lastName))
## [1] "Moe Szyslak"      "Montgomery Burns" "Timothy Lovejoy" 
## [4] "Ned Flanders"     "Homer Simpson"    "Julius Hibbert"
# Same trimmed values, one column per name part.
data.frame(firstName = str_trim(firstName), lastName = str_trim(lastName))
##    firstName lastName
## 1        Moe  Szyslak
## 2 Montgomery    Burns
## 3    Timothy  Lovejoy
## 4        Ned Flanders
## 5      Homer  Simpson
## 6     Julius  Hibbert

(b)Construct a logical vector indicating whether a character has a title(i.e., Rev. and Dr.).

A title consists of two or more letters followed by “.”

# TRUE where the name contains two or more letters immediately followed by a
# period.
title <- str_detect(string = name, pattern = "[[:alpha:]]{2,}[.]")
title
## [1] FALSE FALSE  TRUE FALSE FALSE  TRUE
# Show each name next to its flag.
df1 <- data.frame(name, title)
df1
##                   name title
## 1          Moe Szyslak FALSE
## 2 Burns, C. Montgomery FALSE
## 3 Rev. Timothy Lovejoy  TRUE
## 4         Ned Flanders FALSE
## 5       Simpson, Homer FALSE
## 6   Dr. Julius Hibbert  TRUE

(c)Construct a logical vector indicating whether a character has a second name.

Second Name

# Runs on noTitle rather than name. TRUE where one or more letters are
# immediately followed by a period.
secName <- str_detect(string = noTitle, pattern = "[[:alpha:]]{1,}[.]")
secName
## [1] FALSE  TRUE FALSE FALSE FALSE FALSE
# Show each original name next to its flag.
df2 <- data.frame(name, secName)
df2
##                   name secName
## 1          Moe Szyslak   FALSE
## 2 Burns, C. Montgomery    TRUE
## 3 Rev. Timothy Lovejoy   FALSE
## 4         Ned Flanders   FALSE
## 5       Simpson, Homer   FALSE
## 6   Dr. Julius Hibbert   FALSE

Describe the types of strings that conform to the following regular expressions and construct an example that is matched by each regular expression.

(a)[0-9]+\$

The above regular expression matches one or more digits (0 to 9) immediately followed by a literal $. The match may occur anywhere in the string.

# Test the pattern against four candidates; sol1 ends up holding the logical
# result.
sol1 <- str_detect(
  string = c("124$", "$124", "a2b$", "16$cb"),
  pattern = "[0-9]+\\$"
)
sol1
## [1]  TRUE FALSE FALSE  TRUE

(b)\b[a-z]{1,4}\b

This regular expression matches a standalone word of one to four lowercase letters. \b is a word boundary (the edge between a word character and a non-word character, or the start or end of the string), not a blank.

# Extract the first match from each candidate; sol2 ends up holding the
# result, with NA where nothing matched.
sol2 <- str_extract(
  string = c("abcdf", "ghij", "KLMN", "6542", "opqr 23"),
  pattern = "\\b[a-z]{1,4}\\b"
)
sol2
## [1] NA     "ghij" NA     NA     "opqr"

(c).*?\.txt$ ##This regular expression matches strings that end in .txt, such as file names with a .txt extension.

# Extract the first match from each candidate; sol3 ends up holding the
# result, with NA where nothing matched.
sol3 <- str_extract(
  string = c("ahs.rtxt", "123asd.txt", "txt.r"),
  pattern = ".*?\\.txt$"
)
sol3
## [1] NA           "123asd.txt" NA

(d)\d{2}/\d{2}/\d{4}

This regular expression is looking for two numbers followed by a forward slash followed by two numbers followed by a forward slash followed by 4 numbers, most likely a date format.

# Extract the first match from each candidate; sol4 ends up holding the
# result, with NA where nothing matched.
sol4 <- str_extract(
  string = c("09/24/1979", "mm/dd/yyyy", "23/sept/1979"),
  pattern = "\\d{2}/\\d{2}/\\d{4}"
)
sol4
## [1] "09/24/1979" NA           NA

(e)<(.+?)>.+?</\1>

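This regular expression matches an HTML-style element: an opening tag, some content, and a closing tag whose name repeats the opening tag's name (enforced by the backreference \1).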

# Extract the first match from each candidate; sol5 ends up holding the
# result, with NA where nothing matched.
sol5 <- str_extract(
  string = c("<tag>cuny msda</tag>", "<tag>Assingnment2<tag>", "www.cuny.com"),
  pattern = "<(.+?)>.+?</\\1>"
)
sol5
## [1] "<tag>cuny msda</tag>" NA                     NA

9.The following code hides a secret message. Crack it with R and regular expressions.

Load Hidden message

# The cipher text is one continuous string. It is assembled from short pieces
# with paste0() so that wrapping it across source lines does not embed newline
# characters in the value.
hMessage <- paste0(
  "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0Tanwo",
  "Uwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigO",
  "d6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5",
  "fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"
)

Uppercase letters are more revealing than the other characters, and “.” is used as the word separator.

Extract characters

# Keep only runs of uppercase letters and periods, then flatten the per-string
# list of matches into one character vector.
message <- unlist(
  str_extract_all(string = hMessage, pattern = "[[:upper:].]{1,}")
)
message
##  [1] "C"  "O"  "N"  "G"  "R"  "A"  "T"  "U"  "L"  "AT" "I"  "O"  "N"  "S" 
## [15] "."  "Y"  "O"  "U"  "."  "A"  "R"  "E"  "."  "A"  ".S" "U"  "P"  "E" 
## [29] "R"  "N"  "E"  "R"  "D"

Join and Replace “.” with space

# Glue the extracted pieces into one string, then turn every period into a
# space.
message <- str_replace_all(
  string = paste(message, collapse = ""),
  pattern = "[.]",
  replacement = " "
)
message
## [1] "CONGRATULATIONS YOU ARE A SUPERNERD"