DATA 607 - Assignment 3

SG

  library(stringr)

3. Copy the introductory example. The vector name stores the extracted names.

  # Raw character names exactly as they come out of the introductory example;
  # some are "first last", some are "last, first", and some carry a title.
  name <- c(
    "Moe Szyslak",
    "Burns, C. Montgomery",
    "Rev. Timothy Lovejoy",
    "Ned Flanders",
    "Simpson, Homer",
    "Dr. Julius Hibbert"
  )
  print(name)
## [1] "Moe Szyslak"          "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders"         "Simpson, Homer"       "Dr. Julius Hibbert"

Problem 1. Use the tools of this chapter to rearrange the vector so that all elements conform to the standard first_name last_name.
First we split each row in the vector by the comma followed by the space:

  # Break every element at ", " into a two-column character matrix. Elements
  # that contain no comma keep their full text in column 1 and get an empty
  # string in column 2.
  splitNames <- str_split(
    string = name,
    pattern = ", ",
    simplify = TRUE
  )

Next, we concatenate the first name with the last name, separated by a space:

  # Column 2 holds the part after the comma (the first name) and column 1 the
  # part before it (the last name). For names that had no comma, column 2 is an
  # empty string, so the joined text starts with a stray space; str_trim()
  # removes it. Titles and initials are kept as they are.
  str_trim(str_c(splitNames[, 2], " ", splitNames[, 1]))
## [1] "Moe Szyslak"          "C. Montgomery Burns"  "Rev. Timothy Lovejoy"
## [4] "Ned Flanders"         "Homer Simpson"        "Dr. Julius Hibbert"

Problem 2. Construct a logical vector indicating whether a character has a title(i.e., Rev. and Dr.)

  name
## [1] "Moe Szyslak"          "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders"         "Simpson, Homer"       "Dr. Julius Hibbert"
  # A title is an abbreviation of two or more letters followed by a period and
  # whitespace at the very start of the string (e.g. "Rev. " or "Dr. ").
  # Requiring at least two letters keeps a leading initial such as "C. " from
  # being reported as a title.
  isTitledName <- str_detect(string = name, pattern = "^[[:alpha:]]{2,}[.][[:space:]]")
  isTitledName
## [1] FALSE FALSE  TRUE FALSE FALSE  TRUE

Problem 3. Construct a logical vector indicating whether a character has a second name.

  name
## [1] "Moe Szyslak"          "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders"         "Simpson, Homer"       "Dr. Julius Hibbert"
  # A leading title such as "Rev." or "Dr." (two or more letters plus a period)
  # is not part of the name, so remove it before counting name parts.
  withoutTitle <- str_replace(string = name, pattern = "^[[:alpha:]]{2,}[.][[:space:]]+", replacement = "")
  # Three or more alphabetic tokens mean first, second and last name; an
  # initial such as "C." counts as a token.
  hasSecondName <- str_count(string = withoutTitle, pattern = "[[:alpha:]]+") >= 3
  hasSecondName
## [1] FALSE  TRUE FALSE FALSE FALSE FALSE

4. Describe the types of strings that conform to the following regular expressions and construct an example that is matched by the regular expression.

Problem 1."[0-9]+\$" One or more digits that end in $

  # Candidates: only strings containing digits directly followed by "$" match.
  example <- c("12345\\", "0$", "123456$", "$")
  # simplify = TRUE already returns a character matrix (one row per input, ""
  # where nothing matched), so no unlist() is needed around the call.
  regexExample <- str_extract_all(example, pattern = "[0-9]+\\$", simplify = TRUE)
  regexExample
##      [,1]     
## [1,] ""       
## [2,] "0$"     
## [3,] "123456$"
## [4,] ""

Problem 2. \b[a-z]{1-4}\b
Lowercase letters, of a minumum length of 1 to a maximum length of 4.

# Candidates: only whole words of one to four lowercase letters are extracted.
example <- c(
  "abcd", "helloWorld", "abc",
  "1,4", "e", "abcdefg"
)
# One row per input string; "" marks an input with no match.
regexExample <- str_extract_all(
  string = example,
  pattern = "\\b[a-z]{1,4}\\b",
  simplify = TRUE
)
print(regexExample)
##      [,1]  
## [1,] "abcd"
## [2,] ""    
## [3,] "abc" 
## [4,] ""    
## [5,] "e"   
## [6,] ""

Problem 3. .*?\.txt$
Any string of any characters that ends in .txt

  # Candidates: only strings whose last four characters are ".txt" match
  # (the match is case-sensitive and anchored to the end of the string).
  example <- c(
    ".txt", "test.txt", "abc", "12345&*@&#$.txt",
    "test.TXT", " .txt", ".txttxt", ".txtHello"
  )
  # One row per input string; "" marks an input with no match.
  regexExample <- str_extract_all(
    string = example,
    pattern = ".*?\\.txt$",
    simplify = TRUE
  )
  print(regexExample)
##      [,1]             
## [1,] ".txt"           
## [2,] "test.txt"       
## [3,] ""               
## [4,] "12345&*@&#$.txt"
## [5,] ""               
## [6,] " .txt"          
## [7,] ""               
## [8,] ""

Problem 4. \d{2}/\d{2}/\d{4}
This regex takes in a date format. String: two digits followed by a slash, followed by two digits, followed by another slash, followed by 4 digits

  # Candidates: only strings containing two digits, "/", two digits, "/",
  # four digits are extracted. The digits are not checked for calendar validity.
  example <- c(
    "02/17/2019", "12/", "00/00/0000",
    "abc/a/aa", "1/1/1111"
  )
  # One row per input string; "" marks an input with no match.
  regexExample <- str_extract_all(
    string = example,
    pattern = "\\d{2}/\\d{2}/\\d{4}",
    simplify = TRUE
  )
  print(regexExample)
##      [,1]        
## [1,] "02/17/2019"
## [2,] ""          
## [3,] "00/00/0000"
## [4,] ""          
## [5,] ""

Problem 5. <(.+?)>.+?</\1> This regex seems to be for HTML formats. The string it takes is: any set of characters between < and >, followed by any set of characters, followed by <, the same set of characters that were previously between < and >, a slash, and then >

  # Candidates: an opening tag, non-empty content, and a closing tag whose name
  # repeats the opening tag's name via the \\1 backreference.
  example <- c(
    "<b>bold text</b>", "<a>?</a>", "<b>Work</a>",
    "<#$%&*>Hello</#$%&*>", "<h1></h1>"
  )
  # One row per input string; "" marks an input with no match.
  regexExample <- str_extract_all(
    string = example,
    pattern = "<(.+?)>.+?</\\1>",
    simplify = TRUE
  )
  print(regexExample)
##      [,1]                  
## [1,] "<b>bold text</b>"    
## [2,] "<a>?</a>"            
## [3,] ""                    
## [4,] "<#$%&*>Hello</#$%&*>"
## [5,] ""

9. Extra Credit: clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr

This message reads as CONGRATULATIONS YOU ARE A SUPER NERD!

  # The hidden message is spelled by the capital letters; the "." characters
  # in the text separate its words and the "!" ends it. Extract all three
  # kinds of character in order of appearance.
  hiddenChars <- unlist(str_extract_all("clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr", pattern = "[A-Z.!]"))
  # Glue the characters into one string and turn the period separators into
  # spaces so the result reads as a sentence.
  extraCredit <- str_replace_all(str_c(hiddenChars, collapse = ""), pattern = fixed("."), replacement = " ")
  extraCredit
## [1] "CONGRATULATIONS YOU ARE A SUPER NERD!"