a. Use the tools of this chapter to rearrange the vector so that all elements conform to the standard first_name last_name.
library(stringr)
# Raw text in which the names and phone numbers run together.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"
# A name is any run of two or more letters, periods, commas or spaces;
# raw.data is a single string, so the first list element holds every match.
names <- str_extract_all(raw.data, "[[:alpha:]., ]{2,}")[[1]]
names
## [1] "Moe Szyslak" "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" "Simpson, Homer" "Dr. Julius Hibbert"
# Phone numbers: an optional three-digit area code (optionally in
# parentheses), then three digits and four digits, with an optional
# dash or space between the groups.
phone <- str_extract_all(
  raw.data,
  "\\(?(\\d{3})?\\)?(-| )?\\d{3}(-| )?\\d{4}"
)[[1]]
phone
## [1] "555-1239" "(636) 555-0113" "555-6542" "555 8904"
## [5] "636-555-3226" "5553642"
# Remove titles and middle initials: any run of letters followed by ". "
# (e.g. "Rev. ", "Dr. ", "C. ").
temp_name <- str_replace_all(names, "[[:alpha:]]+\\. ", "")
# Entries written as "Last, First" are swapped into "First Last" so that
# every element follows the first_name last_name standard.
temp_name <- str_replace(temp_name, "^(.+), (.+)$", "\\2 \\1")
# Remove any punctuation still left in the name.
Full_name <- str_replace_all(temp_name, "[[:punct:]]", "")
Full_name
## [1] "Moe Szyslak" "Montgomery Burns" "Timothy Lovejoy"
## [4] "Ned Flanders" "Homer Simpson" "Julius Hibbert"
# Separate First_name and Last_name into different columns.
# Split each full name on the space.
temp <- strsplit(Full_name, " ")
# Every name must have exactly two parts; otherwise the row-wise matrix fill
# below would silently misalign first and last names.
stopifnot(all(lengths(temp) == 2L))
# Create a two-column matrix: one row per person.
mat <- matrix(unlist(temp), ncol = 2, byrow = TRUE)
# Build the data frame with valid column names and the phone numbers;
# stringsAsFactors = FALSE keeps the columns as character on R < 4.0 too.
df <- data.frame(
  First_name = mat[, 1],
  Last_name = mat[, 2],
  Phone = phone,
  stringsAsFactors = FALSE
)
df
## First_name Last_name Phone
## 1 Moe Szyslak 555-1239
## 2 Montgomery Burns (636) 555-0113
## 3 Timothy Lovejoy 555-6542
## 4 Ned Flanders 555 8904
## 5 Homer Simpson 636-555-3226
## 6 Julius Hibbert 5553642
b. Construct a logical vector indicating whether a character has a title (i.e., Rev. and Dr.)
# TRUE for every name that carries a title. The periods are escaped so they
# match a literal "." rather than any character; unescaped, "Dr." would also
# flag a name such as "Drew".
title <- str_detect(names, "Rev\\.|Dr\\.")
title
## [1] FALSE FALSE TRUE FALSE FALSE TRUE
c. Construct a logical vector indicating whether a character has a second name.
# A character has a second name when, after dropping a leading two- or
# three-letter title such as "Rev." or "Dr.", more than two words remain.
initial <- str_count(
  str_replace(names, "[[:alpha:]]{2,3}[.]", ""),
  "\\w+"
) > 2
initial
## [1] FALSE TRUE FALSE FALSE FALSE FALSE
1. [0-9]+\$
Matches one or more digits (0 to 9) immediately followed by a literal “$” sign.
# Six digits followed by a literal dollar sign satisfy the pattern.
str_detect(string = "888888$", pattern = "[0-9]+\\$")
## [1] TRUE
2. \b[a-z]{1,4}\b
Matches a whole word made up of one to four lowercase letters (a to z), with a word boundary on each side; longer words do not match.
# An eight-letter word is longer than the four letters allowed between the
# word boundaries, so nothing matches.
str_detect(string = "abcdefgh", pattern = "\\b[a-z]{1,4}\\b")
## [1] FALSE
3. .*?\.txt$
Matches any string, of any length, that ends with ‘.txt’.
# A file name ending in ".txt" satisfies the pattern.
str_detect(string = "Sample1.txt", pattern = ".*?\\.txt$")
## [1] TRUE
4.\d{2}/\d{2}/\d{4}
Matches 2 digits, followed by “/”, followed by 2 digits, followed by “/”, followed by 4 digits. This is most likely a date format, e.g. dd/mm/yyyy or mm/dd/yyyy.
# A dd/mm/yyyy-style date satisfies the pattern.
str_detect(string = "16/09/2018", pattern = "\\d{2}/\\d{2}/\\d{4}")
## [1] TRUE
5. <(.+?)>.+?</\1>
This uses a backreference (\1) to match an opening HTML tag, its content, and the closing tag with the same name.
# Sample HTML: one element per line. The stray extra "<" that preceded the
# opening <h3> tag has been removed so the sample is well-formed.
tag1 <- "<h3>This is a heading</h3>\n<p>This is a paragraph.</p>"
# \\1 refers back to the tag name captured by (.+?), so each match runs from
# an opening tag to the closing tag with the same name.
str_extract_all(tag1, "<(.+?)>.+?</\\1>")
## [[1]]
## [1] "<h3>This is a heading</h3>" "<p>This is a paragraph.</p>"
clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0Tanwo Uwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigO d6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5 fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr
# The encoded text: the hidden message is spelled by its uppercase letters,
# with periods standing in for the spaces between words.
secret_message <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"
# Collect every run of uppercase letters and periods, glue the runs together,
# then turn each period into a space.
message <- str_replace_all(
  paste(
    str_extract_all(secret_message, "[[:upper:].]{1,}")[[1]],
    collapse = ""
  ),
  "[.]",
  " "
)
message
## [1] "CONGRATULATIONS YOU ARE A SUPERNERD"