RegEx

# stringr supplies the str_* helpers used throughout this document.
library(stringr)

# Raw phone-book text: names and phone numbers run together with no separator.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5543642Dr. Julius Hibbert"

# A name is any run of two or more letters, periods, commas or spaces.
# str_extract_all() returns a list with one element per input string;
# raw.data is a single string, so its matches are the first element.
names <- str_extract_all(raw.data, "[[:alpha:]., ]{2,}")[[1]]
print(names)

## [1] "Moe Szyslak"          "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders"         "Simpson, Homer"       "Dr. Julius Hibbert"

Use the tools of this chapter to rearrange the vector so that all elements conform to the standard first_name last_name. This takes two steps:

remove titles and middle initials;
swap the first and last name in entries that contain a “,” (i.e., entries written as last_name, first_name).

# Step 1: drop every 1-3 letter abbreviation that ends in a period. This
# covers titles ("Rev.", "Dr.") and middle initials ("C."). The leading \\b
# keeps the pattern from eating the tail of a longer word that happens to be
# followed by a period. str_remove_all() handles names that carry both a
# title and an initial, and str_squish() trims the ends and collapses the
# double space left where an initial was removed.
title_free <- str_squish(str_remove_all(names, "\\b[[:alpha:]]{1,3}\\."))

# Step 2: entries written as "Last, First" are swapped to "First Last".
# Entries without a comma do not match and pass through unchanged.
first_last <- str_replace(title_free, "(\\w+),\\s+(\\w+)", "\\2 \\1")
first_last

## [1] "Moe Szyslak"      "Montgomery Burns" "Timothy Lovejoy" 
## [4] "Ned Flanders"     "Homer Simpson"    "Julius Hibbert"

Construct a logical vector indicating whether a character has a title (e.g., Rev. or Dr.).

# A title is taken to be a 2-3 letter abbreviation followed by a period at the
# very start of the name. The lower bound of two letters keeps a leading
# single-letter initial from counting as a title.
# The POSIX class is written in its bracketed form, [[:alpha:]], to match the
# other patterns in this document; the bare [:alpha:] form is not portable to
# base R regex functions.
has_title <- str_detect(names, "^[[:alpha:]]{2,3}\\.")

# Show each name next to its flag.
title <- data.frame(names, title = has_title)
title

##                  names title
## 1          Moe Szyslak FALSE
## 2 Burns, C. Montgomery FALSE
## 3 Rev. Timothy Lovejoy  TRUE
## 4         Ned Flanders FALSE
## 5       Simpson, Homer FALSE
## 6   Dr. Julius Hibbert  TRUE

Construct a logical vector indicating whether a character has a second name.

# A second (middle) name is recognised here by its abbreviated form: a space,
# a single letter, then a period (e.g. " C."). Titles such as "Rev." or "Dr."
# do not match because they are longer than one letter.
# The POSIX class is written in its bracketed form, [[:alpha:]], to match the
# other patterns in this document.
has_second_name <- str_detect(names, " [[:alpha:]]\\.")

# Keep the logical vector under its own name so that `second_name` is bound
# only once, to the data frame that pairs each name with its flag.
second_name <- data.frame(names, second_name = has_second_name)
second_name

##                  names second_name
## 1          Moe Szyslak       FALSE
## 2 Burns, C. Montgomery        TRUE
## 3 Rev. Timothy Lovejoy       FALSE
## 4         Ned Flanders       FALSE
## 5       Simpson, Homer       FALSE
## 6   Dr. Julius Hibbert       FALSE

Describe the types of strings that conform to the following regular expressions and construct an example that is matched by the regular expression.

[0-9]+\$ One or more digits (0–9) immediately followed by a literal dollar sign ($).

# Several digits followed by a literal "$".
str_detect(string = "839503$", pattern = "[0-9]+\\$")

## [1] TRUE

# A single digit is enough to satisfy the "+" quantifier.
str_detect(string = "3$", pattern = "[0-9]+\\$")

## [1] TRUE

\b[a-z]{1,4}\b A word of one to four lowercase letters, delimited by a word boundary on each side.

# Upper limit: a four-letter lowercase word.
str_detect(string = "buzz", pattern = "\\b[a-z]{1,4}\\b")

## [1] TRUE

# Lower limit: a one-letter lowercase word.
str_detect(string = "z", pattern = "\\b[a-z]{1,4}\\b")

## [1] TRUE

.*?\.txt$ Any string that ends with .txt. Because .*? may match zero characters, the bare string ".txt" also qualifies.

# Nothing before the extension: ".*?" is allowed to match zero characters.
str_detect(string = ".txt", pattern = ".*?\\.txt$")

## [1] TRUE

# Arbitrary characters (letters, digits, a space) before the extension.
str_detect(string = "jslfn375hfv .txt", pattern = ".*?\\.txt$")

## [1] TRUE

# Only a space before the extension.
str_detect(string = " .txt", pattern = ".*?\\.txt$")

## [1] TRUE

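\d{2}/\d{2}/\d{4} Two digits, a slash, two digits, a slash, and four digits, for example a date written as dd/mm/yyyy (or mm/dd/yyyy). The pattern does not check that the day or month is valid.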

# Two-digit day, two-digit month and four-digit year separated by slashes.
str_detect(string = "13/09/2018", pattern = "\\d{2}/\\d{2}/\\d{4}")

## [1] TRUE

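<(.+?)>.+?</\1> An opening tag, at least one character of content, and a closing tag whose name repeats the opening tag's name (the backreference \1), for example an HTML element.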

 # Well-formed element: "<td>" opens, the text is the content, and "</td>"
 # closes with the same tag name captured by group 1. The previous example
 # wrapped the element in stray extra angle brackets and matched only because
 # the search is unanchored.
 str_detect("<td>Defines the document type</td>", "<(.+?)>.+?</\\1>")

## [1] TRUE

The following code hides a secret message.

# Encoded text: the message is spelled by the upper-case letters, with "."
# standing in for the space between words.
secret <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0Tanwo Uwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigO d6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5 fy89n6.Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"

# Pull out every run of upper-case letters and message punctuation. "!" is
# included alongside "." and "?" so that the closing exclamation mark of the
# message is kept. The fragments get their own name so that `secret` keeps
# the original encoded string.
fragments <- unlist(str_extract_all(secret, "[[:upper:].!?]+"))

# Join the fragments and turn the "." word separators into spaces.
message <- str_replace_all(paste(fragments, collapse = ""), "[.]", " ")
message

## [1] "CONGRATULATIONS YOU ARE A SUPER NERD!"

RegEx

Olga Shiligin

16 September 2018