unlist(x) - Produces a vector from a list (x)
str_extract(string,pattern) - Returns a substring matching the pattern or NA
str_extract_all(string, pattern) - Returns all substrings matching the pattern or NA
str_locate() Returns a position of first patterned matched
str_locate_all() Returns positions of all pattern matches (a list of matrices)
str_replace / all () Replaces pattern matches
str_split() - Splits a string at pattern
str_split_fixed() Splits string at pattern into fixed number of peices
str_detect() - detects patterns in string
str_count() - Counts number of pattern occurrences in string
library(stringr)
# Raw directory dump: phone numbers and names run together in one string.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"

# A name is a run of two or more letters, periods, commas or spaces.
names <- str_extract_all(raw.data, "[[:alpha:]., ]{2,}")[[1]]

# A phone number is an optional (possibly parenthesised) area code followed by
# 3+ digits and 1+ digits, with an optional "-" or " " between the parts.
phones <- str_extract_all(
  raw.data,
  "\\(?(\\d{3,})?\\)?(-| )?\\d{3,}(-| )?\\d{1,}"
)[[1]]

# Pair the i-th extracted name with the i-th extracted phone number.
myData <- data.frame(name = names, phone = phones)
myData
## name phone
## 1 Moe Szyslak 555-1239
## 2 Burns, C. Montgomery (636) 555-0113
## 3 Rev. Timothy Lovejoy 555-6542
## 4 Ned Flanders 555 8904
## 5 Simpson, Homer 636-555-3226
## 6 Dr. Julius Hibbert 5553642
# Put a name into "First Last" order.
#
# Names written as "Last, First" are flipped around the first comma; any other
# value is returned unchanged.
#
# x: character vector (or factor) of names. The function works element-wise,
#    so it can be called on a whole vector or one name at a time via
#    sapply()/vapply().
# Returns a character vector the same length as x (NA stays NA).
objective1 <- function(x) {
  x <- as.character(x)
  has_comma <- !is.na(x) & str_detect(x, ",")
  # Split on the comma plus any following whitespace, so "Burns,C." is handled
  # like "Burns, C." instead of producing "NA Burns,C.".
  parts <- str_split_fixed(x[has_comma], ",\\s*", n = 2)
  # str_trim() drops the stray space left when one side of the comma is empty.
  x[has_comma] <- str_trim(paste(parts[, 2], parts[, 1]))
  x
}
# Normalise every extracted name to "First Last" order.
# as.character() + USE.NAMES = FALSE keep the result an unnamed character
# vector whether myData$name is a factor (data.frame default before R 4.0) or
# character (R >= 4.0). Without this, the original "Last, First" strings are
# attached as element names and end up as the row labels of the tables built
# from myVector. vapply() also fails loudly if objective1 ever returns anything
# other than a single string.
myVector <- vapply(
  as.character(myData$name),
  objective1,
  character(1),
  USE.NAMES = FALSE
)
myVector
## [1] "Moe Szyslak" "C. Montgomery Burns" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" "Homer Simpson" "Dr. Julius Hibbert"
# Does the name carry an abbreviated title such as "Rev." or "Dr."?
# A title is matched as one uppercase letter, one or more letters, then a
# period.
# y: a name (string). Returns TRUE when such an abbreviation is present.
objective2 <- function(y) {
  title_pattern <- "[:upper:]{1}[:alpha:]+\\."
  str_detect(y, title_pattern)
}
# One-column table: does each name have a title? vapply() guarantees one
# logical per name and, like sapply(), labels the rows with the names.
cbind(eX = vapply(myVector, objective2, logical(1)))
## eX
## Moe Szyslak FALSE
## C. Montgomery Burns FALSE
## Rev. Timothy Lovejoy TRUE
## Ned Flanders FALSE
## Homer Simpson FALSE
## Dr. Julius Hibbert TRUE
I’m not sure what “second name” means…surname? A middle name? They all have two names, one has three.
# Checks whether a name contains an initial (an uppercase letter directly
# followed by a period), which is taken here as the sign of a middle name.
# z: a name (string). Returns TRUE when an initial is found.
objective3 <- function(z) {
  initial_pattern <- "[:upper:]\\."
  str_detect(z, initial_pattern)
}
# One-column table: does each name contain an initial? vapply() guarantees one
# logical per name and, like sapply(), labels the rows with the names.
cbind(resultVector = vapply(myVector, objective3, logical(1)))
## resultVector
## Moe Szyslak FALSE
## C. Montgomery Burns TRUE
## Rev. Timothy Lovejoy FALSE
## Ned Flanders FALSE
## Homer Simpson FALSE
## Dr. Julius Hibbert FALSE
cat("\n\n")
# Reports how many names a person has, not counting a title.
#
# The first title-like token (uppercase letter, one or more letters, a period
# and a space, e.g. "Rev. ") is removed, then the remaining words are counted.
# Words are counted as runs of non-whitespace, so repeated, leading or trailing
# spaces do not inflate the count and an empty string counts as 0.
#
# L: a name (string, or character vector - the function works element-wise).
# Returns a string of the form "Has <n> names".
objective3.5 <- function(L) {
  without_title <- str_replace(L, "[:upper:]{1}[:alpha:]+\\. ", "")
  n_names <- str_count(without_title, "\\S+")
  paste("Has", n_names, "names")
}
# One-column table with the name count per person. vapply() guarantees one
# string per name and, like sapply(), labels the rows with the names.
cbind(resultVector = vapply(myVector, objective3.5, character(1)))
## resultVector
## Moe Szyslak "Has 2 names"
## C. Montgomery Burns "Has 3 names"
## Rev. Timothy Lovejoy "Has 2 names"
## Ned Flanders "Has 2 names"
## Homer Simpson "Has 2 names"
## Dr. Julius Hibbert "Has 2 names"
[0-9]+ also expressed as [:digit:]+ means 1 or more digits 0 through 9. \\ means literal, so \\$ looks for a dollar sign at the end….Example is 9999$
\\b’s are alphanumeric wrappers, [a-z] is also [:lower:] and {1,4} means at least one, no more than 4. This REGEX is used to find lower case words with less than 5 characters and more than 0.
Examples shed, boob, to, a.
“.” is a qualitative wildcard, ’*’ is a quantitative wildcard, ? Means the REGEX will be greedy, \\. means literally “.”, txt$ means end with “txt”…This REGEX is used to identify text files by file name. I would use this after maybe a files = os.listdir() function in python. It would find things like “lazyDog.txt”
\\d{2} means find 2 digits / is just a forward slash…This means find 2 digits, forward slash, 2 digits, forward slash, 4 digits. Practical use, finding dates listed either mm/dd/yyyy or dd/mm/yyyy.
Anything in the () can be called later in a REGEX by backreferencing with \\[:digit:]. “.+?” means get at least one, of as much as you can, of anything. This would get something like ‘< abc >Jojo</ abc>’. Its used for HTML and CSS since that stuff is written with ‘<></>’. The only catch is that backreferencing wants an EXACT copy.
Lets start this off with a bunch of functions, check their output
# Scrambled string to decode.
SecretCode <- 'clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0Tanwo Uwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigO d6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5 fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr'
# First check: every run of consecutive uppercase letters.
str_extract_all(string = SecretCode, pattern = '[:upper:]+')
## [[1]]
## [1] "C" "O" "N" "G" "R" "A" "T" "U" "L" "AT" "I" "O" "N" "S"
## [15] "Y" "O" "U" "A" "R" "E" "A" "S" "U" "P" "E" "R" "N" "E"
## [29] "R" "D"
# Second check: an uppercase letter followed by one to five lowercase letters.
str_extract_all(string = SecretCode, pattern = '[:upper:][:lower:]{1,5}')
## [[1]]
## [1] "Cow" "Ovdicp" "Nuggvh" "Gjuwcz" "Rxs" "Aj" "Tanwo"
## [8] "Uwisdi" "Lj" "Idr" "Oaootj" "Nj" "Sfek" "Ywwoji"
## [15] "Urbz" "Anbhzg" "Ecrop" "Agnb" "Sqo" "Pa" "Em"
## [22] "Nd" "Rgxo" "Dk"
# Third check: runs of lowercase letters. Because `*` also matches zero
# characters, the result contains empty strings between the runs.
str_extract_all(string = SecretCode, pattern = '[:lower:]*')
## [[1]]
## [1] "clcop" "" "ow" "" "zmstc" ""
## [7] "d" "" "" "wnkig" "" ""
## [13] "vdicp" "" "uggvhryn" "" "" ""
## [19] "juwczi" "" "hqrfp" "" "xs" ""
## [25] "" "j" "" "dwpn" "" ""
## [31] "anwo" "" "" "wisdij" "" ""
## [37] "j" "" "kpf" "" "" ""
## [43] "" "" "" "dr" "" "coc"
## [49] "" "bt" "" "yczjat" "" "aootj"
## [55] "" "" "t" "" "" "j"
## [61] "" "ne" "" "c" "" ""
## [67] "fek" "" "r" "" "w" ""
## [73] "" "wwojig" "" "" "d" ""
## [79] "vrf" "" "rbz" "" "" ""
## [85] "bk" "" "nbhzgv" "" "" ""
## [91] "i" "" "" "z" "" "crop"
## [97] "" "w" "" "gnb" "" ""
## [103] "qo" "" "" "" "f" ""
## [109] "a" "" "otfb" "" "w" ""
## [115] "m" "" "" "k" "" "t"
## [121] "" "s" "" "" "zqe" ""
## [127] "" "fy" "" "" "n" ""
## [133] "" "d" "" "t" "" "kc"
## [139] "" "f" "" "" "" ""
## [145] "gmc" "" "" "gxo" "" "nh"
## [151] "" "k" "" "gr" ""
# Fourth check: the string with every digit removed.
str_replace_all(string = SecretCode, pattern = '[:digit:]*', replacement = "")
## [1] "clcopCowzmstcdwnkigOvdicpNuggvhrynGjuwczihqrfpRxsAjdwpnTanwo UwisdijLjkpfATIdrcocbtyczjatOaootjtNjnecSfek.rwYwwojigO dvrfUrbz.bkAnbhzgvRizEcrop.wAgnb.SqoUfPaotfbwEmktsRzqe fynNdtkcfEgmcRgxonhDk!gr"
# Last check: the string with every letter removed, leaving digits, spaces
# and punctuation.
str_replace_all(string = SecretCode, pattern = '[:alpha:]*', replacement = "")
## [1] "10877928550 78035307553364.11 62.24905..6517246395 89659490545!"
Hmm,First check hit the jackpot, last check ends with an exclamation point, I’ll be taking that. The periods seem useful maybe?
# Keep only uppercase letters, periods and the exclamation mark, in order.
clue <- str_extract_all(SecretCode, '[:upper:]|\\.|!')[[1]]
# Insert one extra "." after the 31st extracted character (a hand-picked
# position), then join all characters with "." as the separator.
clue <- paste(append(clue, ".", after = 31), collapse = '.')
clue
## [1] "C.O.N.G.R.A.T.U.L.A.T.I.O.N.S...Y.O.U...A.R.E...A...S.U.P.E.R...N.E.R.D.!"
I take offense.