Github for homework: https://github.com/rweberc/Data607_Assignment3

Raw string

# Raw phone-book text: names and phone numbers run together in one string.
raw.data <-"555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"

Extract names

# stringr supplies every str_*() helper used in this document; load it before
# the first call so the script also runs in a fresh R session.
library(stringr)

# Pull out every run of two or more letters, periods, commas or spaces; in this
# input those runs are the six names sitting between the phone numbers.
# str_extract_all() returns a list with one element per input string, so
# unlist() flattens it to a plain character vector.
names <- unlist(str_extract_all(raw.data, "([[:alpha:]., ]){2,}"))

1. Output first and last name ordered

Keep titles

If value contains “,”, then switch the order of first and last

# Entries written "Last, First" contain a comma: rebuild those as
# "<last word> <first word>" (anything between the two, such as a middle
# initial, is dropped) and pass every other entry through unchanged.
ifelse(
  test = str_detect(names, ","),
  yes = str_c(
    str_extract(names, "[:word:]+$"),
    str_extract(names, "^[:word:]+\\b"),
    sep = " "
  ),
  no = names
)
## [1] "Moe Szyslak"          "Montgomery Burns"     "Rev. Timothy Lovejoy"
## [4] "Ned Flanders"         "Homer Simpson"        "Dr. Julius Hibbert"

Remove titles

Remove middle initials and/or Titles

# Delete every "<letters>. " token: titles such as "Dr. " and initials such as
# "C. ". str_replace_all() already returns a character vector, so the result
# is stored directly.
firstLast <- str_replace_all(
  string = names,
  pattern = "[:alpha:]+\\. ",
  replacement = ""
)
print(firstLast)
## [1] "Moe Szyslak"       "Burns, Montgomery" "Timothy Lovejoy"  
## [4] "Ned Flanders"      "Simpson, Homer"    "Julius Hibbert"

If value contains “,”, then switch the order of first and last

# A comma marks a "Last, First" entry. For those, join the final word and the
# leading word with a space; entries without a comma are returned as they are.
ifelse(
  str_detect(string = firstLast, pattern = ","),
  str_c(
    str_extract(firstLast, pattern = "[:word:]+$"),
    " ",
    str_extract(firstLast, pattern = "^[:word:]+\\b")
  ),
  firstLast
)
## [1] "Moe Szyslak"      "Montgomery Burns" "Timothy Lovejoy" 
## [4] "Ned Flanders"     "Homer Simpson"    "Julius Hibbert"

2. Logical vector of names with a title

# A title is a run of at least two letters followed by a period and a space at
# the very start of the name ("Rev. ", "Dr. "). Requiring two letters keeps a
# leading initial such as "C. " from being reported as a title.
str_detect(names, "^[:alpha:]{2,}\\. ")
## [1] FALSE FALSE  TRUE FALSE FALSE  TRUE

3. Logical vector of names with a “second name”

Assuming “second name” means middle name

# A second name is a word (optionally abbreviated with a period) that has a
# space on both sides, where the character before the leading space is not a
# period -- so the word right after a title such as "Rev. " does not count.
str_detect(
  string = names,
  pattern = "[^.] [:alpha:]+\\.? "
)
## [1] FALSE  TRUE FALSE FALSE FALSE FALSE

4 Regex examples

a. [0-9]+\$

What does it describe?
- Numbers followed by a dollar sign (prices in dollars?)

# The dollar sign is escaped, so it is matched literally instead of acting as
# the end-of-string anchor.
str_detect(string = "1234$", pattern = "[0-9]+\\$")
## [1] TRUE

b. \b[a-z]{1,4}\b

What does it describe?
- a standalone word of one to four lowercase letters (delimited by word boundaries), anywhere in the string

# Test the pattern exactly as stated in the exercise, including the {1,4}
# length limit on the run of lowercase letters.
str_detect(" a ", "\\b[a-z]{1,4}\\b")
## [1] TRUE
str_detect("z", "\\b[a-z]{1,4}\\b")
## [1] TRUE
# Four letters is the longest word that still matches ...
str_detect("abcd", "\\b[a-z]{1,4}\\b")
## [1] TRUE
# ... while five letters in a row leave no word boundary within reach.
str_detect("abcde", "\\b[a-z]{1,4}\\b")
## [1] FALSE

c. .*?\.txt$
What does it describe?
- any string that ends in “.txt”, such as a text file name (the lazy “.*?” allows any prefix, including an empty one)

# A plain file name ending in ".txt" matches.
str_detect(string = "temp.txt", pattern = ".*?\\.txt$")
## [1] TRUE
# Extra dots earlier in the name are absorbed by the ".*?" prefix.
str_detect(string = "doc.temp.txt", pattern = ".*?\\.txt$")
## [1] TRUE

d. \d{2}/\d{2}/\d{4}

What does it describe?
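- dates written as two digits/two digits/four digits, e.g. mm/dd/yyyy or dd/mm/yyyy (the digits are not range-checked, so 99/99/9999 also matches)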

# Month-first date.
str_detect(string = "12/31/1980", pattern = "\\d{2}/\\d{2}/\\d{4}")
## [1] TRUE
# Day-first date: the pattern only checks digit counts, so it matches as well.
str_detect(string = "31/12/1980", pattern = "\\d{2}/\\d{2}/\\d{4}")
## [1] TRUE

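e. <(.+?)>.+?</\1> # The “?” after each “.+” makes that quantifier lazy (it matches as few characters as possible). The difference from the greedy “.+” shows up when a string holds several tag pairs: on "<a>x</a><a>y</a>" the lazy pattern matches "<a>x</a>" and "<a>y</a>" separately, whereas the greedy "<(.+)>.+</\1>" matches the whole string as a single hit.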

What does it describe?
- a whole XML/HTML element: an opening tag, at least one character of something in between, and the matching closing tag

  # Each call passes a single string, so the first (and only) element of the
  # list returned by str_extract_all() holds all matches for that string.
  # Nested empty element inside <html>.
  str_extract_all("<html><abc></abc></html>", "<(.+?)>.+?</\\1>")[[1]]
## [1] "<html><abc></abc></html>"
  # Nested element that has text content.
  str_extract_all("<html><abc>asdf</abc></html>", "<(.+?)>.+?</\\1>")[[1]]
## [1] "<html><abc>asdf</abc></html>"
  # A single space is enough content for ".+?".
  str_extract_all("<html> </html>", "<(.+?)>.+?</\\1>")[[1]]
## [1] "<html> </html>"
  # Same input with ".*" for the content.
  str_extract_all("<html> </html>", "<(.+?)>.*</\\1>")[[1]]
## [1] "<html> </html>"
  # Fully greedy variant on a document with several nested elements.
  str_extract_all(
    "<html><script>asdf</script><body><text>def</text></body></html>",
    "<(.+)>.+</\\1>"
  )[[1]]
## [1] "<html><script>asdf</script><body><text>def</text></body></html>"
  1. Try to decode string

String to decode

# Scrambled text from the exercise: letters of both cases mixed with digits
# and a few punctuation marks.
code <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"

Inspect

# Collect every unbroken run of digits or of punctuation, in order of
# appearance. `code` is a single string, so element 1 of the returned list
# holds all of its matches.
nums <- str_extract_all(code, "[:digit:]+|[:punct:]+")[[1]]
print(nums)
##  [1] "1"   "0"   "87"  "7"   "92"  "8"   "5"   "5"   "0"   "7"   "8"  
## [12] "03"  "5"   "3"   "0"   "7"   "55"  "3"   "3"   "6"   "4"   "."  
## [23] "1"   "1"   "6"   "2"   "."   "2"   "4"   "9"   "05"  "."   "."  
## [34] "65"  "1"   "7"   "24"  "6"   "3"   "9"   "5"   "89"  "6"   "5"  
## [45] "9"   "4"   "905" "4"   "5"   "!"
# Collect every unbroken run of letters, in order of appearance.
# NOTE: assigning to `letters` masks base R's built-in `letters` constant for
# the rest of the session.
letters <- str_extract_all(code, "[:alpha:]+")[[1]]
print(letters)
##  [1] "clcopCow"        "zmstc"           "d"              
##  [4] "wnkig"           "OvdicpNuggvhryn" "Gjuwczi"        
##  [7] "hqrfpRxs"        "Aj"              "dwpn"           
## [10] "TanwoUwisdij"    "Lj"              "kpf"            
## [13] "AT"              "Idr"             "coc"            
## [16] "bt"              "yczjatOaootj"    "t"              
## [19] "Nj"              "ne"              "c"              
## [22] "Sfek"            "r"               "w"              
## [25] "YwwojigOd"       "vrfUrbz"         "bkAnbhzgv"      
## [28] "R"               "i"               "zEcrop"         
## [31] "wAgnb"           "SqoU"            "fPa"            
## [34] "otfb"            "wEm"             "k"              
## [37] "t"               "sR"              "zqe"            
## [40] "fy"              "n"               "Nd"             
## [43] "t"               "kc"              "fE"             
## [46] "gmc"             "Rgxo"            "nhDk"           
## [49] "gr"

Do numbers point to letters in the string?

No: 905 is much larger than the number of letters in the string, so the numbers cannot be positions within it.

# Total number of letters in the coded string: glue all letter runs into one
# string and measure its length.
str_length(string = str_c(letters, collapse = ""))
## [1] 188

See what capital letters there are

# Keep only the capital letters, one match per letter. The result is left as
# the one-element list that str_extract_all() returns.
capLetters <- str_extract_all(string = code, pattern = "[A-Z]")
print(capLetters)
## [[1]]
##  [1] "C" "O" "N" "G" "R" "A" "T" "U" "L" "A" "T" "I" "O" "N" "S" "Y" "O"
## [18] "U" "A" "R" "E" "A" "S" "U" "P" "E" "R" "N" "E" "R" "D"

oh

LKJLgKASJDeFOIJOOIeJGNOIAJtSDOFhINUAOaPSFInUOkDsJ!