Use the tools of this chapter to rearrange the vector so that all elements conform to the standard first_name last_name.
Construct a logical vector indicating whether a character has a title (i.e., Rev. and Dr.).
Construct a logical vector indicating whether a character has a second name.
library(stringr)
library(XML)
library(RCurl)
## Loading required package: bitops
library(tau)
# Raw phone-book string: names and phone numbers are run together with no
# delimiter between records.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5543642Dr. Julius Hibbert"
# Names: every run of at least two letters, periods, commas or spaces.
# raw.data is a single string, so the first list element holds all matches.
name <- str_extract_all(string = raw.data, pattern = "[[:alpha:]., ]{2,}")[[1]]
name
## [1] "Moe Szyslak" "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" "Simpson, Homer" "Dr. Julius Hibbert"
# Phone numbers: optional (possibly parenthesised) 3-digit area code, then
# 3 digits and 4 digits, each group optionally separated by a dash or a space.
phone <- str_extract_all(string = raw.data, pattern = "\\(?(\\d{3})?\\)?(-| )?\\d{3}(-| )?\\d{4}")[[1]]
phone
## [1] "555-1239" "(636) 555-0113" "555-6542" "555 8904"
## [5] "636-555-3226" "5543642"
# Show names and numbers side by side.
data.frame(name = name, phone = phone)
# Normalise every extracted name to "first_name last_name" and record, per
# name, whether it carried a title and whether it carried a second name.
#
# Inputs : `name`  - character vector of raw names (e.g. "Burns, C. Montgomery").
# Outputs: `names`        - cleaned names in "first last" order,
#          `names.title`  - TRUE where a title such as "Rev." or "Dr." was present,
#          `names.second` - TRUE where a second-name initial such as "C." was present.
#
# All steps are vectorised stringr calls with literal patterns, so no text taken
# from the data is ever re-interpreted as a regular expression.
names <- as.character(name)

# 1. "last, rest" -> "rest last". Done first so that titles and initials are
#    found in the same position whichever order the name was written in.
names <- str_replace(names, "^([^,]+),\\s*(.+)$", "\\2 \\1")

# 2. A title is a leading abbreviation of two or more letters ended by a period
#    ("Rev.", "Dr."); a single-letter initial such as "C." does not qualify.
names.title <- str_detect(names, "^[[:alpha:]]{2,}\\.")

# 3. A second name shows up as a single-letter initial followed by a period.
#    The word boundary keeps the trailing "v." of "Rev." from matching.
names.second <- str_detect(names, "\\b[[:alpha:]]\\.")

# 4. Strip the title and any initials, then tidy leftover whitespace.
names <- str_replace(names, "^[[:alpha:]]{2,}\\.\\s*", "")
names <- str_trim(str_replace_all(names, "\\b[[:alpha:]]\\.\\s*", ""))
# Show the cleaned names and the two logical indicator vectors.
names
## [1] "Moe Szyslak" "Montgomery Burns" "Timothy Lovejoy"
## [4] "Ned Flanders" "Homer Simpson" "Julius Hibbert"
# cat() prints each logical element as TRUE/FALSE after the label.
cat("The logical vector indicating presence of a title is:", names.title, "\n")
## The logical vector indicating presence of a title is: FALSE FALSE TRUE FALSE FALSE TRUE
cat("The logical vector indicating presence of a second name is:", names.second, "\n")
## The logical vector indicating presence of a second name is: FALSE TRUE FALSE FALSE FALSE FALSE
# (firstname=substr(firstname, str_locate(pattern='\\.', firstname)+2,stop=nchar(firstname)))
#(lastname=substr(names1, 1, str_locate(pattern=',', names1)-1))
#str_locate(pattern='\\.', "Montgomery")
#str_extract("Burns, C. Montgomery",pattern=',')
#str_locate(pattern='\.', "Moe Szysak")
#str_detect(pattern='\.', "Moe Szysak")
#str_extract("Moe Szyslak", "[[:punct:]]")
#str_extract_all("Moe Szyslak", "[[:punct:]]")
#str_split(names, "[[:punct:]]")
#grep(".",names1)
#nchar("Moe Szyslak")
# Pattern "[0-9]+\\$": one or more digits immediately followed by a literal "$".
input <- c("8789$", "what", "abc22$", "56", "$")
unlist(str_extract_all(string = input, pattern = "[0-9]+\\$"))
## [1] "8789$" "22$"
# Pattern "\\b[a-z]{1,4}\\b": a stand-alone run of one to four lowercase letters.
input <- c(input, "awesome", "34..awa")
unlist(str_extract_all(string = input, pattern = "\\b[a-z]{1,4}\\b"))
## [1] "what" "awa"
# Pattern ".*?\\.txt$": any string that ends in ".txt".
input <- c(input, ".txt", "23.txt")
unlist(str_extract_all(string = input, pattern = ".*?\\.txt$"))
## [1] ".txt" "23.txt"
A string containing 2 digits followed by a slash followed by 2 digits followed by a slash followed by 4 digits. This is the standard date format dd/mm/yyyy or mm/dd/yyyy.
# Date-shaped string: two digits, a slash, two digits, a slash, four digits.
input <- "02/17/2019"
str_extract_all(string = input, pattern = "\\d{2}/\\d{2}/\\d{4}")
## [[1]]
## [1] "02/17/2019"
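A string containing an opening tag such as `<b>`, followed by one or more characters, followed by the matching closing tag `</b>`. The backreference `\1` forces the closing tag name to repeat whatever the opening tag captured, so this is the shape of a simple HTML/XML element, e.g. `<b>text</b>`.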
# Pattern "<(.+?)>.+?</\\1>": an opening tag, some content, and a closing tag
# whose name repeats the opening one (the \\1 backreference).
# The last two elements exercise the pattern: "<b>bold</b>" has matching tags,
# while "<a>x</b>" does not and must be rejected.
input <- c("8789$", "what", "abc22$", "56", "$", "awesome", "34..awa", ".txt", "23.txt", "/1", "ab/1", "\\", "<b>bold</b>", "<a>x</b>")
unlist(str_extract_all(input, "<(.+?)>.+?</\\1>"))
## [1] "<b>bold</b>"
# Scrambled text with a message hidden in its capital letters; the literal
# deliberately spans several lines, so it contains embedded newlines.
string <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0Tanwo
Uwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigO
d6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5
fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"
# Keep only the uppercase ASCII letters, in order of appearance.
str_extract_all(string = string, pattern = "[A-Z]")
## [[1]]
## [1] "C" "O" "N" "G" "R" "A" "T" "U" "L" "A" "T" "I" "O" "N" "S" "Y" "O"
## [18] "U" "A" "R" "E" "A" "S" "U" "P" "E" "R" "N" "E" "R" "D"
CONGRATULATIONS YOU ARE A SUPER NERD