GitHub repository for this homework: https://github.com/rweberc/Data607_Assignment3
# Raw text in which names and phone numbers are run together.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"

# A name is any run of two or more letters, periods, commas, or spaces.
names <- str_extract_all(raw.data, "([[:alpha:]., ]){2,}")[[1]]

# For entries containing a comma ("Last, First"), show the final word followed
# by the leading word; entries without a comma are shown unchanged.
ifelse(
  str_detect(names, ","),
  str_c(
    str_extract(names, "[:word:]+$"),
    str_extract(names, "^[:word:]+\\b"),
    sep = " "
  ),
  names
)
## [1] "Moe Szyslak" "Montgomery Burns" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" "Homer Simpson" "Dr. Julius Hibbert"
# Remove every "<letters>. " token. This strips the titles ("Rev. ", "Dr. ")
# and the middle initial ("C. ") in a single pass.
firstLast <- str_replace_all(names, "[:alpha:]+\\. ", "")
firstLast
## [1] "Moe Szyslak" "Burns, Montgomery" "Timothy Lovejoy"
## [4] "Ned Flanders" "Simpson, Homer" "Julius Hibbert"

# Reorder the remaining "Last, First" entries as "First Last".
ifelse(
  str_detect(firstLast, ","),
  str_c(
    str_extract(firstLast, "[:word:]+$"),
    str_extract(firstLast, "^[:word:]+\\b"),
    sep = " "
  ),
  firstLast
)
## [1] "Moe Szyslak" "Montgomery Burns" "Timothy Lovejoy"
## [4] "Ned Flanders" "Homer Simpson" "Julius Hibbert"
# TRUE where a name begins with a title: two or more letters, a period, and a
# space. Requiring at least two letters keeps a leading initial such as "C. "
# from being counted as a title.
str_detect(names, "^[:alpha:]{2,}\\. ")
## [1] FALSE FALSE TRUE FALSE FALSE TRUE
Assuming “second name” means a middle name or middle initial:
# TRUE where a word (optionally ending in a period) sits between two spaces and
# the first of those spaces does not directly follow a period, so a word that
# comes right after a title is not counted.
str_detect(string = names, pattern = "[^.] [:alpha:]+\\.? ")
## [1] FALSE TRUE FALSE FALSE FALSE FALSE
What does it describe?
- One or more digits immediately followed by a literal dollar sign (for example, a price written as "1234$").
# One or more digits immediately followed by a literal "$".
str_detect(string = "1234$", pattern = "[0-9]+\\$")
## [1] TRUE
What does it describe?
- A single lowercase letter standing alone as a word, i.e. with a word boundary on each side, anywhere in the string.
# A lone lowercase letter with a word boundary on each side: once surrounded by
# spaces, once as the entire string.
str_detect(string = " a ", pattern = "\\b[a-z]\\b")
## [1] TRUE
str_detect(string = "z", pattern = "\\b[a-z]\\b")
## [1] TRUE
# Any string that ends in the literal suffix ".txt"; earlier periods in the
# name are absorbed by the leading ".*?".
str_detect(string = "temp.txt", pattern = ".*?\\.txt$")
## [1] TRUE
str_detect(string = "doc.temp.txt", pattern = ".*?\\.txt$")
## [1] TRUE
What does it describe?
- Dates written as two digits, two digits, and four digits separated by slashes, e.g. mm/dd/yyyy or dd/mm/yyyy.
# Two digits, a slash, two digits, a slash, four digits. The pattern does not
# distinguish month-first from day-first dates, so both examples match.
str_detect(string = "12/31/1980", pattern = "\\d{2}/\\d{2}/\\d{4}")
## [1] TRUE
str_detect(string = "31/12/1980", pattern = "\\d{2}/\\d{2}/\\d{4}")
## [1] TRUE
What does it describe?
- A whole XML/HTML element: an opening tag, at least one character of content, and the closing tag whose name matches the opening tag.
# Lazy tag name, lazy non-empty content, then the closing tag whose name is the
# back-reference \\1. Each input is a single string, so the matches are read
# from the first (only) element of the returned list.
str_extract_all("<html><abc></abc></html>", "<(.+?)>.+?</\\1>")[[1]]
## [1] "<html><abc></abc></html>"
str_extract_all("<html><abc>asdf</abc></html>", "<(.+?)>.+?</\\1>")[[1]]
## [1] "<html><abc>asdf</abc></html>"
str_extract_all("<html> </html>", "<(.+?)>.+?</\\1>")[[1]]
## [1] "<html> </html>"

# Variant whose content part is ".*", which also permits empty content.
str_extract_all("<html> </html>", "<(.+?)>.*</\\1>")[[1]]
## [1] "<html> </html>"

# Fully greedy variant applied to a document with several nested elements.
str_extract_all(
  "<html><script>asdf</script><body><text>def</text></body></html>",
  "<(.+)>.+</\\1>"
)[[1]]
## [1] "<html><script>asdf</script><body><text>def</text></body></html>"
# Scrambled text that hides a message.
code <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"

# First attempt: collect every run of digits and every run of punctuation.
nums <- str_extract_all(code, "[:digit:]+|[:punct:]+")[[1]]
nums
## [1] "1" "0" "87" "7" "92" "8" "5" "5" "0" "7" "8"
## [12] "03" "5" "3" "0" "7" "55" "3" "3" "6" "4" "."
## [23] "1" "1" "6" "2" "." "2" "4" "9" "05" "." "."
## [34] "65" "1" "7" "24" "6" "3" "9" "5" "89" "6" "5"
## [45] "9" "4" "905" "4" "5" "!"
# Second attempt: collect every run of letters.
# Note: this assignment masks base R's built-in `letters` constant for the rest
# of the session.
letters <- str_extract_all(code, "[:alpha:]+")[[1]]
letters
## [1] "clcopCow" "zmstc" "d"
## [4] "wnkig" "OvdicpNuggvhryn" "Gjuwczi"
## [7] "hqrfpRxs" "Aj" "dwpn"
## [10] "TanwoUwisdij" "Lj" "kpf"
## [13] "AT" "Idr" "coc"
## [16] "bt" "yczjatOaootj" "t"
## [19] "Nj" "ne" "c"
## [22] "Sfek" "r" "w"
## [25] "YwwojigOd" "vrfUrbz" "bkAnbhzgv"
## [28] "R" "i" "zEcrop"
## [31] "wAgnb" "SqoU" "fPa"
## [34] "otfb" "wEm" "k"
## [37] "t" "sR" "zqe"
## [40] "fy" "n" "Nd"
## [43] "t" "kc" "fE"
## [46] "gmc" "Rgxo" "nhDk"
## [49] "gr"

# Total number of letters across all runs.
sum(str_length(letters))
## [1] 188
# Pull out every capital letter. The result is a one-element list because
# str_extract_all() returns one character vector per input string.
capLetters <- str_extract_all(code, "[A-Z]")
capLetters
## [[1]]
## [1] "C" "O" "N" "G" "R" "A" "T" "U" "L" "A" "T" "I" "O" "N" "S" "Y" "O"
## [18] "U" "A" "R" "E" "A" "S" "U" "P" "E" "R" "N" "E" "R" "D"

# Assemble the hidden message as one readable string. The periods in `code`
# mark the word breaks, so keep them (and the closing "!") while extracting,
# then turn each period into a space.
str_replace_all(
  str_c(str_extract_all(code, "[A-Z.!]")[[1]], collapse = ""),
  fixed("."),
  " "
)
## [1] "CONGRATULATIONS YOU ARE A SUPERNERD!"
oh
LKJLgKASJDeFOIJOOIeJGNOIAJtSDOFhINUAOaPSFInUOkDsJ!