library("stringr")
## Warning: package 'stringr' was built under R version 3.4.1
# Raw input for the exercise: a single character string in which names and
# what appear to be phone numbers are run together without separators.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"
print(raw.data)
## [1] "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"
Copy the introductory example. The vector `names` stores the extracted names.
# Pull every run of two or more letters, periods, commas, or spaces out of
# raw.data, then flatten the list returned by str_extract_all() into a plain
# character vector.
# Note: this variable shares its name with base::names(); calls to names()
# still resolve to the base function.
names <- unlist(
  str_extract_all(string = raw.data, pattern = "[[:alpha:]., ]{2,}")
)
print(names)
## [1] "Moe Szyslak" "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" "Simpson, Homer" "Dr. Julius Hibbert"
For the first names I combine three patterns. The first regex, “,.+$”, matches one or more characters between a comma and the end of the string; this pulls the first names of Monte Burns and Homer Simpson. The second, “\\..+ ”, matches one or more characters between a period and a space; this is for Rev. Lovejoy and Dr. Hibbert. Finally, for Moe and Ned, “^.[^\\.]{2} ” matches exactly three characters (the last two of which are not periods) between the start of the string and a space.
For the last names there were two patterns: either the last name ends in a comma, as for Homer and Monte (“.+,”), or it is 7 or 8 characters between a space and the end of the string, as for everyone else (“[ [:blank:]].{7,8}$”).
I used for loops to take out unwanted spaces and punctuation marks.
# Split every entry of `names` into a first-name piece and a last-name piece,
# clean both, and reassemble them as "First Last" in a one-column data frame.
#
# str_extract() returns a plain character vector holding the first match per
# entry (NA when nothing matches), so the cleanup below works on whole vectors
# instead of relying on implicit coercion of one-element list slices.

# First-name piece, one alternative per name layout:
#   ",.+$"         from the comma to the end of the string ("Last, First")
#   "\\..+ "       from a period up to the last following space (after a title)
#   "^.[^\\.]{2} " three leading characters, the last two not periods, + space
first_name <- str_extract(names, ",.+$|\\..+ |^.[^\\.]{2} ")

# Clean up the data a little by getting rid of punctuation and spaces. The
# patterns are applied in this order on purpose: ", " and ". " must be removed
# before the remaining single spaces are.
for (junk in c(", ", "\\. ", " ")) {
  first_name <- gsub(junk, "", first_name)
}
# The previous step also removed the "." and space inside "C. Montgomery";
# put them back so the initial is kept rather than dropped.
first_name <- gsub("CM", "C. M", first_name)

# Last-name piece: either everything up to and including a comma, or a blank
# followed by the final 7-8 characters of the string.
last_name <- str_extract(names, ".+,|[ [:blank:]].{7,8}$")

# Remove the trailing comma and the leading blank captured by the pattern.
# Without stripping the blank, paste() below would produce a double space
# (for example "Moe  Szyslak").
for (junk in c(",", "^[[:blank:]]+")) {
  last_name <- gsub(junk, "", last_name)
}

names_df <- data.frame(name = paste(first_name, last_name))
names_df
## name
## 1 Moe Szyslak
## 2 C. Montgomery Burns
## 3 Timothy Lovejoy
## 4 Ned Flanders
## 5 Homer Simpson
## 6 Julius Hibbert
The titles are either two or three letters long and come first, so I used str_detect to look for two or three characters between the start of the string and a period: “^.{3}\\.|^.{2}\\.”.
# Flag the entries of `names` that start with exactly three, or exactly two,
# characters followed by a literal period.
title <- str_detect(string = names, pattern = "^.{3}\\.|^.{2}\\.")

# Append the flags to the data frame as a new column called `title`.
names_df <- data.frame(names_df, title = title, check.names = FALSE)
print(names_df)
## name title
## 1 Moe Szyslak FALSE
## 2 C. Montgomery Burns FALSE
## 3 Timothy Lovejoy TRUE
## 4 Ned Flanders FALSE
## 5 Homer Simpson FALSE
## 6 Julius Hibbert TRUE
What makes Monte Burns different is that his name is the only one containing both a comma and a period, so I used str_detect to look for strings with at least one character between a comma and a later period: “,.+\\.”.
# Flag the entries of `names` in which a comma is followed, after at least one
# other character, by a literal period.
second <- str_detect(string = names, pattern = ",.+\\.")

# Append the flags to the data frame as a new column called `scnd_nm`.
names_df <- data.frame(names_df, scnd_nm = second, check.names = FALSE)
print(names_df)
## name title scnd_nm
## 1 Moe Szyslak FALSE FALSE
## 2 C. Montgomery Burns FALSE TRUE
## 3 Timothy Lovejoy TRUE FALSE
## 4 Ned Flanders FALSE FALSE
## 5 Homer Simpson FALSE FALSE
## 6 Julius Hibbert TRUE FALSE
# Sample strings used to probe each regular expression below.
test_list <- c(
  "5$", "$5", "five$", "7,134,689$",
  "at", "dab", "dAb", "DaB", "band", "banana",
  "homework", "homework.doc", "homework.txt", "txt", ".txt",
  "7/4/76", "07/04/76", "07/04/1776", "07-04-1776",
  "<link>rpubs.com</link>", "<bold>Got It!</link>"
)
print(test_list)
## [1] "5$" "$5"
## [3] "five$" "7,134,689$"
## [5] "at" "dab"
## [7] "dAb" "DaB"
## [9] "band" "banana"
## [11] "homework" "homework.doc"
## [13] "homework.txt" "txt"
## [15] ".txt" "7/4/76"
## [17] "07/04/76" "07/04/1776"
## [19] "07-04-1776" "<link>rpubs.com</link>"
## [21] "<bold>Got It!</link>"
# Pattern: one or more digits immediately followed by a literal dollar sign.
# cbind() pairs each test string with its match flag; the result is a
# character matrix, so the flags are stored as "TRUE"/"FALSE".
dollar <- cbind(
  test_list,
  dollar = str_detect(test_list, "[0-9]+\\$")
)
print(dollar)
## test_list dollar
## [1,] "5$" "TRUE"
## [2,] "$5" "FALSE"
## [3,] "five$" "FALSE"
## [4,] "7,134,689$" "TRUE"
## [5,] "at" "FALSE"
## [6,] "dab" "FALSE"
## [7,] "dAb" "FALSE"
## [8,] "DaB" "FALSE"
## [9,] "band" "FALSE"
## [10,] "banana" "FALSE"
## [11,] "homework" "FALSE"
## [12,] "homework.doc" "FALSE"
## [13,] "homework.txt" "FALSE"
## [14,] "txt" "FALSE"
## [15,] ".txt" "FALSE"
## [16,] "7/4/76" "FALSE"
## [17,] "07/04/76" "FALSE"
## [18,] "07/04/1776" "FALSE"
## [19,] "07-04-1776" "FALSE"
## [20,] "<link>rpubs.com</link>" "FALSE"
## [21,] "<bold>Got It!</link>" "FALSE"
# Pattern: a run of one to four lowercase letters with a word boundary on each
# side. cbind() pairs each test string with its match flag; the result is a
# character matrix, so the flags are stored as "TRUE"/"FALSE".
words <- cbind(
  test_list,
  words = str_detect(test_list, "\\b[a-z]{1,4}\\b")
)
print(words)
## test_list words
## [1,] "5$" "FALSE"
## [2,] "$5" "FALSE"
## [3,] "five$" "TRUE"
## [4,] "7,134,689$" "FALSE"
## [5,] "at" "TRUE"
## [6,] "dab" "TRUE"
## [7,] "dAb" "FALSE"
## [8,] "DaB" "FALSE"
## [9,] "band" "TRUE"
## [10,] "banana" "FALSE"
## [11,] "homework" "FALSE"
## [12,] "homework.doc" "TRUE"
## [13,] "homework.txt" "TRUE"
## [14,] "txt" "TRUE"
## [15,] ".txt" "TRUE"
## [16,] "7/4/76" "FALSE"
## [17,] "07/04/76" "FALSE"
## [18,] "07/04/1776" "FALSE"
## [19,] "07-04-1776" "FALSE"
## [20,] "<link>rpubs.com</link>" "TRUE"
## [21,] "<bold>Got It!</link>" "TRUE"
# Pattern: a literal ".tx" optionally followed by "t", anywhere in the string
# (the leading lazy ".*?" may match nothing).
# NOTE(review): the trailing "?" makes the final "t" optional; if the intent
# was "ends in .txt", an end anchor ("$") may have been meant - confirm.
# cbind() pairs each test string with its match flag; the result is a
# character matrix, so the flags are stored as "TRUE"/"FALSE".
txt <- cbind(
  test_list,
  txt = str_detect(test_list, ".*?\\.txt?")
)
print(txt)
## test_list txt
## [1,] "5$" "FALSE"
## [2,] "$5" "FALSE"
## [3,] "five$" "FALSE"
## [4,] "7,134,689$" "FALSE"
## [5,] "at" "FALSE"
## [6,] "dab" "FALSE"
## [7,] "dAb" "FALSE"
## [8,] "DaB" "FALSE"
## [9,] "band" "FALSE"
## [10,] "banana" "FALSE"
## [11,] "homework" "FALSE"
## [12,] "homework.doc" "FALSE"
## [13,] "homework.txt" "TRUE"
## [14,] "txt" "FALSE"
## [15,] ".txt" "TRUE"
## [16,] "7/4/76" "FALSE"
## [17,] "07/04/76" "FALSE"
## [18,] "07/04/1776" "FALSE"
## [19,] "07-04-1776" "FALSE"
## [20,] "<link>rpubs.com</link>" "FALSE"
## [21,] "<bold>Got It!</link>" "FALSE"
# Pattern: two digits, a slash, two digits, a slash, then four digits; the
# pattern is not anchored, so it may match anywhere in the string.
# cbind() pairs each test string with its match flag; the result is a
# character matrix, so the flags are stored as "TRUE"/"FALSE".
dates <- cbind(
  test_list,
  dates = str_detect(test_list, "\\d{2}/\\d{2}/\\d{4}")
)
print(dates)
## test_list dates
## [1,] "5$" "FALSE"
## [2,] "$5" "FALSE"
## [3,] "five$" "FALSE"
## [4,] "7,134,689$" "FALSE"
## [5,] "at" "FALSE"
## [6,] "dab" "FALSE"
## [7,] "dAb" "FALSE"
## [8,] "DaB" "FALSE"
## [9,] "band" "FALSE"
## [10,] "banana" "FALSE"
## [11,] "homework" "FALSE"
## [12,] "homework.doc" "FALSE"
## [13,] "homework.txt" "FALSE"
## [14,] "txt" "FALSE"
## [15,] ".txt" "FALSE"
## [16,] "7/4/76" "FALSE"
## [17,] "07/04/76" "FALSE"
## [18,] "07/04/1776" "TRUE"
## [19,] "07-04-1776" "FALSE"
## [20,] "<link>rpubs.com</link>" "FALSE"
## [21,] "<bold>Got It!</link>" "FALSE"
# Pattern: "<name>", some content, then "</name>", where the backreference \\1
# forces the closing tag to repeat whatever the first group captured.
# cbind() pairs each test string with its match flag; the result is a
# character matrix, so the flags are stored as "TRUE"/"FALSE".
html <- cbind(
  test_list,
  html = str_detect(test_list, "<(.+?)>.+?</\\1>")
)
print(html)
## test_list html
## [1,] "5$" "FALSE"
## [2,] "$5" "FALSE"
## [3,] "five$" "FALSE"
## [4,] "7,134,689$" "FALSE"
## [5,] "at" "FALSE"
## [6,] "dab" "FALSE"
## [7,] "dAb" "FALSE"
## [8,] "DaB" "FALSE"
## [9,] "band" "FALSE"
## [10,] "banana" "FALSE"
## [11,] "homework" "FALSE"
## [12,] "homework.doc" "FALSE"
## [13,] "homework.txt" "FALSE"
## [14,] "txt" "FALSE"
## [15,] ".txt" "FALSE"
## [16,] "7/4/76" "FALSE"
## [17,] "07/04/76" "FALSE"
## [18,] "07/04/1776" "FALSE"
## [19,] "07-04-1776" "FALSE"
## [20,] "<link>rpubs.com</link>" "TRUE"
## [21,] "<bold>Got It!</link>" "FALSE"