library(stringr)
# Raw phone-book text: phone numbers and names are run together with no
# separators between the entries.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5543642Dr. Julius Hibbert"
# A name is any run of at least two letters, periods, commas or spaces.
# raw.data is a single string, so the first (and only) list element returned
# by str_extract_all() holds all of the matches.
names <- str_extract_all(raw.data, "[[:alpha:]., ]{2,}")[[1]]
names
## [1] "Moe Szyslak" "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" "Simpson, Homer" "Dr. Julius Hibbert"
- Use the tools of this chapter to rearrange the vector so that all elements conform to the standard first_name last_name. This takes two steps:
- remove titles and middle initials;
- swap the first and last name in entries written as “last_name, first_name”.
# Step 1: drop every title or initial, i.e. each standalone token of one to
# three letters that ends in a period ("Rev.", "Dr.", "C.").
# - str_replace_all() removes all such tokens, not just the first one, so a
#   name carrying both a title and a middle initial is cleaned completely.
# - The leading \\b keeps the pattern from biting off the tail of a longer
#   word that happens to be followed by a period.
# str_trim() then removes the whitespace left at the ends.
title_free <- str_trim(str_replace_all(names, "\\b[[:alpha:]]{1,3}\\.", ""))
# Step 2: turn "last_name, first_name" into "first_name last_name" by swapping
# the two captured words; \\s+ absorbs any extra spaces left by step 1.
first_last <- str_replace(title_free, "(\\w+),\\s+(\\w+)", "\\2 \\1")
first_last
## [1] "Moe Szyslak" "Montgomery Burns" "Timothy Lovejoy"
## [4] "Ned Flanders" "Homer Simpson" "Julius Hibbert"
- Construct a logical vector indicating whether a character has a title (i.e., Rev. and Dr.)
# TRUE for every name that starts with a title: two or three letters followed
# by a period at the very beginning of the string ("Dr.", "Rev.").
# The POSIX class is written inside a bracket expression ([[:alpha:]]), the
# documented form and the one used by the other patterns in this script.
has_title <- str_detect(names, "^[[:alpha:]]{2,3}\\.")
# Show each name next to its flag.
title <- data.frame(names, title = has_title)
title
## names title
## 1 Moe Szyslak FALSE
## 2 Burns, C. Montgomery FALSE
## 3 Rev. Timothy Lovejoy TRUE
## 4 Ned Flanders FALSE
## 5 Simpson, Homer FALSE
## 6 Dr. Julius Hibbert TRUE
- Construct a logical vector indicating whether a character has a second name.
# TRUE for every name that contains a middle initial: a space, a single
# letter and a period (" C.").
# The POSIX class is written inside a bracket expression ([[:alpha:]]), the
# documented form and the one used by the other patterns in this script.
# The logical vector keeps its own name so it is not lost when the data frame
# below is created.
has_second_name <- str_detect(names, " [[:alpha:]]\\.")
# Show each name next to its flag.
second_name <- data.frame(names, second_name = has_second_name)
second_name
## names second_name
## 1 Moe Szyslak FALSE
## 2 Burns, C. Montgomery TRUE
## 3 Rev. Timothy Lovejoy FALSE
## 4 Ned Flanders FALSE
## 5 Simpson, Homer FALSE
## 6 Dr. Julius Hibbert FALSE
- Describe the types of strings that conform to the following regular expressions and construct an example that is matched by the regular expression.
- [0-9]+\$ One or more digits (0 to 9) immediately followed by a literal dollar sign ($).
# Two strings that satisfy the pattern: a long run of digits and a single
# digit, each immediately followed by a literal "$".
str_detect(string = "839503$", pattern = "[0-9]+\\$")
## [1] TRUE
str_detect(string = "3$", pattern = "[0-9]+\\$")
## [1] TRUE
- \b[a-z]{1,4}\b Whole words made of 1 to 4 lowercase letters, delimited by a word boundary on each side.
# Two strings that satisfy the pattern: a four-letter word (the upper limit)
# and a one-letter word (the lower limit).
str_detect(string = "buzz", pattern = "\\b[a-z]{1,4}\\b")
## [1] TRUE
str_detect(string = "z", pattern = "\\b[a-z]{1,4}\\b")
## [1] TRUE
- .*?\.txt$ Any string that ends with the literal suffix .txt; the part before the suffix may be anything, including nothing at all.
# Three strings that satisfy the pattern: the bare suffix, arbitrary text
# before the suffix, and a single space before the suffix.
str_detect(string = ".txt", pattern = ".*?\\.txt$")
## [1] TRUE
str_detect(string = "jslfn375hfv .txt", pattern = ".*?\\.txt$")
## [1] TRUE
str_detect(string = " .txt", pattern = ".*?\\.txt$")
## [1] TRUE
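- \d{2}/\d{2}/\d{4} Two digits, a slash, two digits, a slash and four digits, e.g. a date written as dd/mm/yyyy (or mm/dd/yyyy); the pattern does not check that the numbers form a valid date.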
# A string that satisfies the pattern: a date written with two-digit day and
# month and a four-digit year, separated by slashes.
str_detect(string = "13/09/2018", pattern = "\\d{2}/\\d{2}/\\d{4}")
## [1] TRUE
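- <(.+?)>.+?</\1> An HTML/XML-style element: an opening tag, at least one character of content, and a closing tag whose name repeats the text captured from the opening tag (backreference \1).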
# A string that satisfies the pattern: a well-formed element whose closing tag
# repeats the name captured from the opening tag. The example has no stray
# angle brackets around it, so the match starts at the first character.
str_detect("<td>Defines the document type</td>", "<(.+?)>.+?</\\1>")
## [1] TRUE
- The following code hides a secret message.
# Encoded text: the message is spelled by the upper-case letters, and the
# periods mark the gaps between its words.
secret <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0Tanwo Uwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigO d6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5 fy89n6.Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"
# Keep only the runs of upper-case letters and periods. The character class
# holds exactly those two kinds of characters, so no other punctuation can
# leak into the decoded text.
secret <- unlist(str_extract_all(secret, "[[:upper:].]+"))
# Glue the fragments together and turn each period into a space; fixed()
# matches the period literally instead of as a regex metacharacter.
message <- str_replace_all(paste(secret, collapse = ""), fixed("."), " ")
message
## [1] "CONGRATULATIONS YOU ARE A SUPER NERD"