library('stringr')
## Warning: package 'stringr' was built under R version 3.4.4
# Sample input: names and phone numbers run together in a single string.
raw.data <- "555-1239Moe Szyslak(636) 555-0113 Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"
# A name is any run of two or more letters, spaces, commas, or periods.
name <- unlist(
  str_extract_all(string = raw.data, pattern = "[[:alpha:] ,.]{2,}")
)
This does most of the work: for every name that contains a comma, move the last name (the part before the comma) to the end:
# Rebuild each entry as "<rest> <Last,>": remove everything up to and including
# the comma, then append the extracted "Last," token. Entries without a comma
# get an empty string appended instead of NA.
name <- local({
  without.last <- str_replace(name, '[[:alpha:]., ]+,', '')
  last.with.comma <- str_extract(name, '[[:alpha:]]+,')
  paste(without.last, str_replace_na(last.with.comma, ''))
})
name
## [1] "Moe Szyslak " " C. Montgomery Burns," "Rev. Timothy Lovejoy "
## [4] "Ned Flanders " " Homer Simpson," "Dr. Julius Hibbert "
The only remaining issue is some stray leading/trailing spaces and trailing commas:
# Drop a comma at the very end of each entry, then trim surrounding whitespace.
name <- local({
  no.trailing.comma <- str_replace(name, ',$', '')
  str_trim(no.trailing.comma)
})
name
## [1] "Moe Szyslak" "C. Montgomery Burns" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" "Homer Simpson" "Dr. Julius Hibbert"
# Title abbreviations to look for (written without the trailing period).
common.titles <- c(
  "Dr", "Esq", "Hon", "Jr", "Mr",
  "Mrs", "Ms", "Messrs", "Mmes", "Msgr",
  "Prof", "Rev", "Rt Hon", "Sr", "St"
)
This builds one long alternation that matches any of these titles at the start of a word, immediately followed by a period (“.”):
# Wrap every title as \bTitle\. (word boundary, title, literal period) and
# join the alternatives with "|".
title.expression <- paste0('\\b', common.titles, '\\.', collapse = "|")
title.expression
## [1] "\\bDr\\.|\\bEsq\\.|\\bHon\\.|\\bJr\\.|\\bMr\\.|\\bMrs\\.|\\bMs\\.|\\bMessrs\\.|\\bMmes\\.|\\bMsgr\\.|\\bProf\\.|\\bRev\\.|\\bRt Hon\\.|\\bSr\\.|\\bSt\\."
Look for those titles, ignoring case
# Flag each name that contains one of the title alternatives, in any case.
has.title <- str_detect(
  string = name,
  pattern = regex(title.expression, ignore_case = TRUE)
)
cbind(name, has.title)
## name has.title
## [1,] "Moe Szyslak" "FALSE"
## [2,] "C. Montgomery Burns" "FALSE"
## [3,] "Rev. Timothy Lovejoy" "TRUE"
## [4,] "Ned Flanders" "FALSE"
## [5,] "Homer Simpson" "FALSE"
## [6,] "Dr. Julius Hibbert" "TRUE"
# Count the alphabetic tokens in each name, discount one token when a title was
# detected, and flag names that still have three or more tokens.
second.name <- local({
  token.count <- str_count(name, '[[:alpha:]]+')
  name.part.count <- token.count - as.integer(has.title)
  name.part.count >= 3
})
cbind(name, second.name)
## name second.name
## [1,] "Moe Szyslak" "FALSE"
## [2,] "C. Montgomery Burns" "TRUE"
## [3,] "Rev. Timothy Lovejoy" "FALSE"
## [4,] "Ned Flanders" "FALSE"
## [5,] "Homer Simpson" "FALSE"
## [6,] "Dr. Julius Hibbert" "FALSE"
# One or more digits followed by a literal dollar sign.
str_detect(string = '12341234$', pattern = '[0-9]+\\$')
## [1] TRUE
# A whole word made of one to four lowercase letters.
str_detect(string = 'duck', pattern = '\\b[a-z]{1,4}\\b')
## [1] TRUE
# Any text that ends in ".txt".
str_detect(string = 'blah blah blah 123.txt', pattern = '.*?\\.txt$')
## [1] TRUE
# Two digits, a slash, two digits, a slash, then four digits.
str_detect(string = '02/22/1985', pattern = '\\d{2}/\\d{2}/\\d{4}')
## [1] TRUE
# An opening tag, some content, and a closing tag that repeats the captured
# tag name via the \1 back-reference.
str_detect(string = '<class>Data 607</class>', pattern = '<(.+?)>.+?</\\1>')
## [1] TRUE
# Scrambled text: a message hidden among lowercase letters and digits.
raw.message <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"
I'm going to pull out a few different character classes to see if I can make sense of it:
# Every run of consecutive digits in the message.
unlist(str_extract_all(string = raw.message, pattern = '\\d+'))
## [1] "1" "0" "87" "7" "92" "8" "5" "5" "0" "7" "8"
## [12] "03" "5" "3" "0" "7" "55" "3" "3" "6" "4" "1"
## [23] "1" "6" "2" "2" "4" "9" "05" "65" "1" "7" "24"
## [34] "6" "3" "9" "5" "89" "6" "5" "9" "4" "905" "4"
## [45] "5"
# Every run of consecutive lowercase letters in the message.
unlist(str_extract_all(string = raw.message, pattern = '[a-z]+'))
## [1] "clcop" "ow" "zmstc" "d" "wnkig" "vdicp"
## [7] "uggvhryn" "juwczi" "hqrfp" "xs" "j" "dwpn"
## [13] "anwo" "wisdij" "j" "kpf" "dr" "coc"
## [19] "bt" "yczjat" "aootj" "t" "j" "ne"
## [25] "c" "fek" "r" "w" "wwojig" "d"
## [31] "vrf" "rbz" "bk" "nbhzgv" "i" "z"
## [37] "crop" "w" "gnb" "qo" "f" "a"
## [43] "otfb" "w" "m" "k" "t" "s"
## [49] "zqe" "fy" "n" "d" "t" "kc"
## [55] "f" "gmc" "gxo" "nh" "k" "gr"
# Every run of consecutive uppercase letters in the message.
unlist(str_extract_all(string = raw.message, pattern = '[A-Z]+'))
## [1] "C" "O" "N" "G" "R" "A" "T" "U" "L" "AT" "I" "O" "N" "S"
## [15] "Y" "O" "U" "A" "R" "E" "A" "S" "U" "P" "E" "R" "N" "E"
## [29] "R" "D"
Well, I can see what the message is supposed to be, but I’m wondering whether the punctuation is helpful as well. Let's get rid of the lowercase letters and digits:
# Delete the lowercase runs first, then the digit runs; a named pattern vector
# applies each pattern = replacement pair in the order given.
str_replace_all(raw.message, c('[a-z]+' = '', '\\d+' = ''))
## [1] "CONGRATULATIONS.YOU.ARE.A.SUPERNERD!"
There we go. Just one more change
# Same clean-up as before, plus one final step that turns each period into a
# space; the pattern = replacement pairs are applied in the order given.
str_replace_all(
  raw.message,
  c('[a-z]+' = '', '\\d+' = '', '\\.' = ' ')
)
## [1] "CONGRATULATIONS YOU ARE A SUPERNERD!"