Our final answer is presented first, followed by the code to accomplish it.
## First_Name Last_Name Second_Name_Boolean Title_Boolean Title
## 1 Moe Szyslak No No
## 2 C. Montgomery Burns Yes No
## 3 Rev Timothy Lovejoy No Yes Rev
## 4 Ned Flanders No No
## 5 Homer Simpson No No
## 6 Dr Julius Hibbert No Yes Dr
library(stringr)
# Raw phone-book text: names and phone numbers run together without delimiters.
raw.data <-"555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"

# Scratch table with one row per name token (this input yields 15 tokens).
# The columns are character and start out empty so that cells no later step
# writes (e.g. the Title of a person without one) stay blank. Integer
# placeholders would be coerced to the string "0" by the first character
# assignment into the column and would then show up in the printed result.
new.data <- data.frame(
  "First_Name" = character(15),
  "Last_Name" = character(15),
  "Second_Name_Boolean" = character(15),
  "Title_Boolean" = character(15),
  "Title" = character(15),
  stringsAsFactors = FALSE
)

# A name token is either an initial ("C.") or a capitalised word of 2-10
# letters, optionally followed by the comma of a "Last, First" entry.
name.pattern <- "[A-Z][.]|[A-Z]{1}[[:alpha:]]{1,9}\\,?"

# Strip names and initials in order: take the first token from the remaining
# text, store it in column 1, then continue with the text after that token.
split.string <- raw.data
next.split <- split.string
for (i in seq_len(nrow(new.data))) {
  extracter <- str_extract(next.split, name.pattern)
  new.data[[i, 1]] <- extracter
  # Split at the first match only; element 2 is everything after the token.
  split.string <- str_split_fixed(next.split, name.pattern, 2)
  next.split <- split.string[[2]]
}
# Join each initial (a token containing ".") with the name token that follows
# it, so "C." + "Montgomery" becomes the single entry "C. Montgomery", and
# record in column 3 whether the row holds such a two-part name.
# Bounds come from the table itself: a hard-coded shift range would count
# backwards, and index past the last row, if an initial appeared near the end.
row.count <- nrow(new.data)
for (i in seq_len(row.count)) {
  token <- new.data[[i, 1]]
  # isTRUE() treats the NA rows left behind by earlier merges as "no initial".
  if (isTRUE(str_detect(token, "[.]"))) {
    if (i < row.count) {
      tokens <- new.data[[1]]
      tokens[i] <- paste(token, tokens[i + 1], sep = " ")
      # Drop the absorbed token, move the rest up one row, pad the end with NA.
      new.data[[1]] <- c(tokens[-(i + 1)], NA)
    }
    new.data[[i, 3]] <- "Yes"
  } else {
    new.data[[i, 3]] <- "No"
  }
}
# Move titles to the Title column (5), prefix them to the name token that
# follows, and flag the row in Title_Boolean (4).
# The pattern is anchored so only a token that IS a title counts; the name
# extraction does not keep the trailing period, so titles arrive as "Dr" or
# "Rev". Unanchored, a name such as "Drew" would be mistaken for a title.
title.pattern <- "^(Dr|Rev)$"
row.count <- nrow(new.data)
for (i in seq_len(row.count)) {
  token <- new.data[[i, 1]]
  # isTRUE() keeps NA padding rows from aborting the if(); a title in the very
  # last row has nothing to attach to and is left alone.
  if (isTRUE(str_detect(token, title.pattern)) && i < row.count) {
    new.data[[i, 5]] <- token
    tokens <- new.data[[1]]
    tokens[i] <- paste(token, tokens[i + 1], sep = " ")
    # Drop the absorbed token, move the rest up one row, pad the end with NA.
    new.data[[1]] <- c(tokens[-(i + 1)], NA)
    new.data[[i, 4]] <- "Yes"
  } else {
    new.data[[i, 4]] <- "No"
  }
}
# Fill the First_Name / Last_Name columns. Each person is read from a pair of
# consecutive rows starting at an odd row (1-2, 3-4, ..., 11-12); the result
# is written into the odd row of the pair.
for (first.row in seq(1, 11, by = 2)) {
  second.row <- first.row + 1
  leading.token <- new.data[[first.row, 1]]
  if (str_detect(leading.token, ",")) {
    # "Last, First" entry: the leading token is the surname plus a comma.
    # Keep only its letters and store them as the last name.
    new.data[[first.row, 2]] <- str_extract(leading.token, "[:alpha:]{1,15}")
    # The given name and its flags sit on the following row; pull them up.
    for (column in c(1, 3, 4, 5)) {
      new.data[[first.row, column]] <- new.data[[second.row, column]]
    }
  } else {
    # "First Last" entry: the following row holds the surname.
    new.data[[first.row, 2]] <- new.data[[second.row, 1]]
  }
}
# Result table: one row per person, same five columns as the scratch table.
final.answer <- data.frame(
  "First_Name" = integer(6),
  "Last_Name" = integer(6),
  "Second_Name_Boolean" = integer(6),
  "Title_Boolean" = integer(6),
  "Title" = integer(6)
)
# Person k was assembled in scratch row 2k - 1 (rows 1, 3, ..., 11); copy
# those rows cell by cell so every placeholder in the result is overwritten.
for (person in 1:6) {
  source.row <- 2 * person - 1
  for (column in 1:5) {
    final.answer[[person, column]] <- new.data[[source.row, column]]
  }
}
final.answer
The first pattern matches one or more digits immediately followed by a dollar sign.
# Sample text: digit runs, some directly followed by "$" and some not.
example1 <- "990$ gog akd kkd2$5$ %%%5$"
# "[0-9]+\\$" is one or more digits followed by a literal dollar sign.
# str_extract_all() returns one list element per input string; with a single
# input string, the first element holds every match.
extraction1 <- str_extract_all(example1, "[0-9]+\\$")[[1]]
extraction1
## [1] "990$" "2$" "5$" "5$"
The second pattern matches whole words made up of one to four lowercase letters.
# Sample text with short and long words, capitalised words and punctuation.
example2 <- "I am for.. each panda Now."
# "\\b[a-z]{1,4}\\b" is a run of 1-4 lowercase letters with a word boundary on
# both sides, so longer words and words containing a capital are skipped.
extraction2 <- str_extract_all(example2, "\\b[a-z]{1,4}\\b")[[1]]
extraction2
## [1] "am" "for" "each"
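The third pattern matches a whole string that ends in .txt, as a text-file name would. Because `$` anchors the match to the end of the string, a .txt that appears earlier in the string is not matched on its own.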
# Sample text containing ".txt" both in the middle and at the very end.
example3 <- "I am.txt$ for.. each panda Now.txt"
# ".*?\\.txt$" is any characters (lazily), then a literal ".txt" that must sit
# at the end of the string; here the single match is the entire string.
extraction3 <- str_extract_all(example3, ".*?\\.txt$")[[1]]
extraction3
## [1] "I am.txt$ for.. each panda Now.txt"
# Smallest possible case: nothing at all in front of the extension.
example4 <- ".txt"
# The lazy ".*?" may match zero characters, so a bare ".txt" still matches.
extraction4 <- str_extract_all(example4, ".*?\\.txt$")[[1]]
extraction4
## [1] ".txt"
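The fourth pattern matches dates written as mm/dd/yyyy, with exactly two digits for the month and day and four for the year (so 3/17/1980 is skipped).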
# Three dates; the middle one has a one-digit month.
example5 <- "05/26/2004 3/17/1980 07/16/2000"
# "\\d{2}/\\d{2}/\\d{4}" is two digits, slash, two digits, slash, four digits.
# "3/17/1980" has no two-digit first field and is therefore not returned.
extraction5 <- str_extract_all(example5, "\\d{2}/\\d{2}/\\d{4}")[[1]]
extraction5
## [1] "05/26/2004" "07/16/2000"
The last pattern matches HTML-style elements, like the color tag around this statement: an opening tag, some content, and a closing tag with the same name.
# One complete element followed by a dangling, unterminated closing tag.
example6 <- "<fred>aa aa</fred> ate </fred"
# "<(.+?)>.+?</\\1>" is an opening tag whose name is captured, some content,
# then a closing tag that repeats the captured name via the \\1 backreference.
# The trailing "</fred" has no ">" and is not part of any match.
extraction6 <- str_extract_all(example6, "<(.+?)>.+?</\\1>")[[1]]
extraction6
## [1] "<fred>aa aa</fred>"