Load the stringr library

library("stringr")

Copy the introductory example. The vector `name` stores the extracted names.

# Phone-book style input: names and phone numbers run together without separators
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"
# A name is any run of two or more letters, periods, commas, or spaces.
# str_extract_all() returns a list with one element per input string; there is
# a single input string here, so its first element holds every extracted name.
name <- str_extract_all(raw.data, "[[:alpha:]., ]{2,}")[[1]]
# Show the extracted names
print(name)
## [1] "Moe Szyslak"          "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders"         "Simpson, Homer"       "Dr. Julius Hibbert"

Use the tools of this chapter to rearrange the vector so that all elements conform to the standard form first_name last_name.

# Names at positions 1, 3, 4 and 6 are already in "first_name last_name" order
proper <- name[c(1, 3, 4, 6)]

# Names stored as "last_name, first_name" that still have to be reordered
problem_Homer <- name[5]
problem_Burns <- name[2]

# Turn one "last_name, first_name" string into "first_name last_name":
# split at the comma, trim the whitespace around each piece, then join the
# two pieces in reverse order with a single space between them.
swap_last_first <- function(full_name) {
  pieces <- str_trim(str_split(full_name, ",")[[1]])
  str_c(pieces[2], pieces[1], sep = " ")
}

# Apply the same reordering to both problem names
Homer_Fixed <- swap_last_first(problem_Homer)
Burns_Fixed <- swap_last_first(problem_Burns)

# Collect the untouched names followed by the two corrected ones, then show them
Simpsons_First_Last <- c(proper, Homer_Fixed, Burns_Fixed)
print(Simpsons_First_Last)
## [1] "Moe Szyslak"          "Rev. Timothy Lovejoy" "Ned Flanders"        
## [4] "Dr. Julius Hibbert"   "Homer Simpson"        "C. Montgomery Burns"

Construct a logical vector indicating whether a character has a title (i.e., Rev. and Dr.)

# Show name again so the positions of the TRUE/FALSE flags below can be read off
print(name)
## [1] "Moe Szyslak"          "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders"         "Simpson, Homer"       "Dr. Julius Hibbert"
# A title is taken to be two or more letters followed directly by a period.
#   [[:alpha:]]{2,}  at least two alphabetic characters; this keeps a lone
#                    initial such as "C." from being counted as a title
#   \\.              the R string "\\." is the regex \. , i.e. a literal period
title_check <- str_detect(string = name, pattern = "[[:alpha:]]{2,}\\.")
print(title_check)
## [1] FALSE FALSE  TRUE FALSE FALSE  TRUE

Construct a logical vector indicating whether a character has a second name

# Flag the characters who have a second name. In this data the only such
# character is C. Montgomery Burns, whose first name is abbreviated to a single
# capital letter followed by a period, so that initial is what gets detected.
# Show name first so the positions of the flags below can be read off.
name
## [1] "Moe Szyslak"          "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders"         "Simpson, Homer"       "Dr. Julius Hibbert"
#   \\b    word boundary: the capital letter must stand on its own; without it
#          the last letter of an abbreviation such as "MD." would be mistaken
#          for an initial
#   [A-Z]  exactly one capital letter
#   \\.    a literal period
secondary_name <- str_detect(name, "\\b[A-Z]\\.")
secondary_name
## [1] FALSE  TRUE FALSE FALSE FALSE FALSE

Describe the types of strings that conform to the following regular expressions and construct an example that is matched by the regular expression

Example 1

# Pattern: one or more digits immediately followed by a literal dollar sign.
#   [0-9]+  a digit, repeated one or more times
#   \\$     an escaped "$", i.e. a literal dollar sign rather than the
#           end-of-string anchor
reg_ex <- "[0-9]+\\$"
# Matching example: digits followed by "$"
example <- "1219$"
print(example)
## [1] "1219$"
str_detect(string = example, pattern = reg_ex)
## [1] TRUE
# Non-matching example: contains neither digits nor a dollar sign
example_b <- "Hello World"
print(example_b)
## [1] "Hello World"
str_detect(string = example_b, pattern = reg_ex)
## [1] FALSE

Example 2

# Pattern: a whole word made of one to four lowercase letters.
#   \\b         word boundary (the edge of a word)
#   [a-z]{1,4}  a lowercase letter, repeated at least once and at most four times
#   \\b         word boundary again, so the word cannot continue past four letters
reg_ex <- "\\b[a-z]{1,4}\\b"
# Matching example: a four-letter lowercase word
example <- "stop"
print(example)
## [1] "stop"
str_detect(string = example, pattern = reg_ex)
## [1] TRUE
# Non-matching example: a five-letter word is one letter too long
example_b <- "fifth"
print(example_b)
## [1] "fifth"
str_detect(string = example_b, pattern = reg_ex)
## [1] FALSE

Example 3

# Pattern: any string that ends in ".txt".
#   .*?     any character, zero or more times, matched lazily (as few as possible)
#   \\.txt  a literal period followed by "txt"
#   $       end of the string
reg_ex <- ".*?\\.txt$"
# Matching example: a file name with the .txt extension
example <- "Regular Expressions and Essential String Functions.txt"
print(example)
## [1] "Regular Expressions and Essential String Functions.txt"
str_detect(string = example, pattern = reg_ex)
## [1] TRUE
# Non-matching example: same file name, but with a .pdf extension
example_b <- "Regular Expressions and Essential String Functions.pdf"
print(example_b)
## [1] "Regular Expressions and Essential String Functions.pdf"
str_detect(string = example_b, pattern = reg_ex)
## [1] FALSE

Example 4

# Pattern: a date-like string of the form dd/dd/dddd.
#   \\d{2}  exactly two digits
#   /       a literal forward slash
#   \\d{4}  exactly four digits
reg_ex <- "\\d{2}/\\d{2}/\\d{4}"
# Matching example: two digits, two digits, four digits
example <- "09/17/2017"
print(example)
## [1] "09/17/2017"
str_detect(string = example, pattern = reg_ex)
## [1] TRUE
# Non-matching example: the last group has only two digits
example_b <- "09/17/17"
print(example_b)
## [1] "09/17/17"
str_detect(string = example_b, pattern = reg_ex)
## [1] FALSE

Example 5

# Pattern: an opening tag, some content, and a closing tag with the same name.
#   <(.+?)>  "<", then one or more characters matched lazily and captured as
#            group 1 (the tag name), then ">"
#   .+?      the tag content: one or more characters, matched lazily
#   </\\1>   "</", then a backreference to group 1, then ">", so the closing
#            tag has to repeat the opening tag's name
reg_ex <- "<(.+?)>.+?</\\1>"
# Matching example: the tag is opened and properly closed
example <- "<kobe>60 Points Final Game</kobe>"
print(example)
## [1] "<kobe>60 Points Final Game</kobe>"
str_detect(string = example, pattern = reg_ex)
## [1] TRUE
# Non-matching example: the second tag is missing the "/" of a closing tag
example_b <- "<kobe>60 Points Final Game<kobe>"
print(example_b)
## [1] "<kobe>60 Points Final Game<kobe>"
str_detect(string = example_b, pattern = reg_ex)
## [1] FALSE

Extra Credit

The following code hides a secret message. Crack it with R and regular expressions. Hint: Some of the characters are more revealing than others! The code snippet is also available in the materials at www.r-datacollection.com

# Scrambled text that hides a message among lowercase letters and digits
extra_credit <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"
extra_credit
## [1] "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"
# Find all uppercase letters: they carry the message
str_extract_all(extra_credit, "[[:upper:]]")
## [[1]]
##  [1] "C" "O" "N" "G" "R" "A" "T" "U" "L" "A" "T" "I" "O" "N" "S" "Y" "O"
## [18] "U" "A" "R" "E" "A" "S" "U" "P" "E" "R" "N" "E" "R" "D"
# The uppercase letters alone lose the word breaks. The periods in the text
# separate the words and the "!" ends the sentence, so extract those as well,
# glue everything into one string, and turn each period into a space.
message_pieces <- unlist(str_extract_all(extra_credit, "[[:upper:].!]"))
secret_message <- str_replace_all(str_c(message_pieces, collapse = ""), "\\.", " ")
secret_message
## [1] "CONGRATULATIONS YOU ARE A SUPERNERD!"