Load the raw data into R.
# Raw phone-book text: phone numbers and names run together in one string.
# It is assembled here from one piece per entry; the result is a single string.
raw.data <- paste0(
  "555-1239Moe Szyslak",
  "(636) 555-0113Burns, C. Montgomery",
  "555-6542Rev. Timothy Lovejoy",
  "555 8904Ned Flanders",
  "636-555-3226Simpson, Homer",
  "5553642Dr. Julius Hibbert"
)
raw.data
## [1] "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"
We use the stringr library for data manipulation.
library(stringr)
Look for runs of alphabetic characters that may also contain periods, commas, and spaces, and add a quantifier requiring a length of at least two.
# Extract every run of two or more letters, periods, commas, or spaces from
# the raw text, then flatten the list result into a plain character vector.
name <- unlist(
  str_extract_all(string = raw.data, pattern = "[[:alpha:]., ]{2,}")
)
name
## [1] "Moe Szyslak" "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" "Simpson, Homer" "Dr. Julius Hibbert"
Find any text of 2 to 3 characters followed by “.” and a space (a title such as “Rev.” or “Dr.”).
# Flag names that carry a title such as "Rev." or "Dr.": a whole word of two
# or three letters followed by a period and a space. The leading \\b keeps the
# tail of a longer word (e.g. "ery. ") from counting as a title, and
# [[:alpha:]] excludes the digits and underscore that \\w would also accept.
title <- str_detect(name, "\\b[[:alpha:]]{2,3}\\. ")
# Show each name next to its flag (cbind coerces the logicals to strings).
cbind(name, title)
## name title
## [1,] "Moe Szyslak" "FALSE"
## [2,] "Burns, C. Montgomery" "FALSE"
## [3,] "Rev. Timothy Lovejoy" "TRUE"
## [4,] "Ned Flanders" "FALSE"
## [5,] "Simpson, Homer" "FALSE"
## [6,] "Dr. Julius Hibbert" "TRUE"
# Flag names that contain a second name written as an initial: whitespace,
# one ASCII letter, a period, and a space, as in " C. ".
# [A-Za-z] is used instead of [A-z] because the A-z range also spans the
# punctuation characters [ \ ] ^ _ ` that lie between "Z" and "a" in ASCII.
sec_name <- str_detect(name, "\\s[A-Za-z]\\. ")
# Show each name next to its flag (cbind coerces the logicals to strings).
cbind(name, sec_name)
## name sec_name
## [1,] "Moe Szyslak" "FALSE"
## [2,] "Burns, C. Montgomery" "TRUE"
## [3,] "Rev. Timothy Lovejoy" "FALSE"
## [4,] "Ned Flanders" "FALSE"
## [5,] "Simpson, Homer" "FALSE"
## [6,] "Dr. Julius Hibbert" "FALSE"
One or more digits followed by a “$” sign.
# One or more digits immediately followed by a literal dollar sign. The
# trailing "123" has no "$" after it, so only "123$" is extracted.
sample1 <- "abc123$123"
a <- str_extract_all(sample1, "[0-9]+\\$")
a
## [[1]]
## [1] "123$"
A word boundary, then 1 to 4 lowercase letters, then another word boundary.
# Whole words of one to four lowercase letters. "abcde" is too long for the
# {1,4} quantifier between the two word boundaries, so only "abc" matches.
example2 <- "abcde abc"
str_extract_all(example2, "\\b[a-z]{1,4}\\b")
## [[1]]
## [1] "abc"
# Any (lazily matched) run of characters that ends in ".txt" at the end of
# the string; the dot before "txt" is escaped so it matches a literal period.
example3 <- ".local.txt"
str_extract_all(example3, ".*?\\.txt$")
## [[1]]
## [1] ".local.txt"
Two digits, a slash, two digits, a slash, and then four digits.
# A date laid out as two digits, a slash, two digits, a slash, four digits.
example4 <- "02/17/2019"
str_extract_all(example4, "\\d{2}/\\d{2}/\\d{4}")
## [[1]]
## [1] "02/17/2019"
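An opening tag, some content, and a closing tag whose name repeats the opening tag's name (matched with a backreference to the first capture group).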
# An opening tag, some content, and a closing tag. The backreference \\1
# requires the closing tag name to repeat whatever group 1 captured.
example5 <- "<any>anything</any>"
str_extract_all(example5, "<(.+?)>.+?</\\1>")
## [[1]]
## [1] "<any>anything</any>"