Homework Questions
- Copy the introductory example. The vector `name` stores the extracted names:
  R> name
  [1] "Moe Szyslak" "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
  [4] "Ned Flanders" "Simpson, Homer" "Dr. Julius Hibbert"
library(stringr)
# Raw phone-book string from the introductory example: phone numbers and names
# are run together without separators. It is assembled here from one piece per
# entry so that each line stays readable; the resulting value is a single string.
raw.data <- paste0(
  "555-1239Moe Szyslak",
  "(636) 555-0113Burns, C. Montgomery",
  "555-6542Rev. Timothy Lovejoy",
  "555 8904Ned Flanders",
  "636-555-3226Simpson, Homer",
  "5543642Dr. Julius Hibbert"
)
print(raw.data)
## [1] "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5543642Dr. Julius Hibbert"
- Use the tools of this chapter to rearrange the vector so that all elements conform to the standard first_name last_name.
# Extract every run of two or more letters, periods, commas or spaces from the
# raw string; these runs are the names.
name <- unlist(
  str_extract_all(string = raw.data, pattern = "[[:alpha:]., ]{2,}")
)
print(name)
## [1] "Moe Szyslak" "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" "Simpson, Homer" "Dr. Julius Hibbert"
# Step 1: remove the first short abbreviation in each name (1-3 letters, a
# period, then whitespace). This drops "Rev. ", "Dr. " and the initial "C. ".
name_remove_title_middle <- str_replace(
  string = name,
  pattern = "([[:alpha:]]{1,3}\\.\\s)",
  replacement = ""
)
# Step 2: turn "last, first" into "first last" by swapping the two captured
# words; names without a comma are left as they are.
name_first_last <- str_replace(
  string = name_remove_title_middle,
  pattern = "([[:alpha:]]+), ([[:alpha:]]+)",
  replacement = "\\2 \\1"
)
print(name_first_last)
## [1] "Moe Szyslak" "Montgomery Burns" "Timothy Lovejoy"
## [4] "Ned Flanders" "Homer Simpson" "Julius Hibbert"
- Construct a logical vector indicating whether a character has a title (i.e., Rev. and Dr.).
# TRUE for every name that contains the title "Dr." or "Rev.".
# The periods are escaped: an unescaped "." matches any character, so the
# pattern "Dr.|Rev." would also flag names such as "Drew" or "Reva".
str_detect(string = name, pattern = "Dr\\.|Rev\\.")
## [1] FALSE FALSE TRUE FALSE FALSE TRUE
- Construct a logical vector indicating whether a character has a second name.
# A character is treated as having a second name when, after the title has been
# removed and surrounding whitespace trimmed, the name still contains more than
# one space.
# The periods in the title pattern are escaped so that only a literal "Dr." or
# "Rev." is stripped; an unescaped "." would match any character.
str_count(
  str_trim(str_replace_all(name, "Dr\\.|Rev\\.", "")),
  " "
) > 1
## [1] FALSE TRUE FALSE FALSE FALSE FALSE
- Describe the types of strings that conform to the following regular expressions and construct an example that is matched by the regular expression.
- [0-9]+\$
# The pattern matches one or more digits immediately followed by a literal
# dollar sign (the "$" is escaped, so it is not an end-of-string anchor).
# Example: "24$" inside "$24$03".
# The example vector is not called `t`, which would mask base::t(), and every
# element is written as a string instead of relying on c() to coerce 2403.
dollar_examples <- c("test$", "$24$03", "2403")
unlist(str_extract_all(dollar_examples, "[0-9]+\\$"))
## [1] "24$"
- \b[a-z]{1,4}\b
# The pattern matches whole words made of one to four lowercase letters: the
# word boundaries on both sides rule out longer words, words containing a
# capital letter, and letters attached to digits.
st <- "This is a test of question 4b"
unlist(
  str_extract_all(string = st, pattern = "\\b[a-z]{1,4}\\b")
)
## [1] "is" "a" "test" "of"
- .*?\.txt$
# The pattern matches any string that ends in the literal suffix ".txt": a lazy
# run of arbitrary characters, an escaped period, "txt", then end of string.
tt <- c(
  "test.txt",
  "test.xlsx",
  "test.docx"
)
unlist(
  str_extract_all(string = tt, pattern = ".*?\\.txt$")
)
## [1] "test.txt"
- \d{2}/\d{2}/\d{4}
# The pattern matches two digits, a slash, two digits, a slash and four digits,
# e.g. a date written as dd/mm/yyyy or mm/dd/yyyy. One-digit days or months and
# two-digit years do not match.
ft <- c(
  "2/12/19",
  "02/12/2019",
  "02/12/19"
)
unlist(
  str_extract_all(string = ft, pattern = "\\d{2}/\\d{2}/\\d{4}")
)
## [1] "02/12/2019"
- <(.+?)>.+?</\1>
# The pattern matches an opening tag <name>, at least one character of content,
# and a closing tag </name> whose name repeats the opening one (back-reference
# \1), as in HTML/XML elements.
# The second example is a lone closing tag. It was previously written as
# "<\test>", where "\t" is the escape sequence for a TAB character, so the
# string did not contain the text it appeared to.
fit <- c("<test>blah</test>", "</test>", "<tester>")
unlist(str_extract_all(fit, "<(.+?)>.+?</\\1>"))
## [1] "<test>blah</test>"