Create subset of this vector that has only names and call it name.
- Use the tools of this chapter to rearrange the vector so that all elements conform to the standard first_name last_name.
# Rearrange every entry of `name` into "first_name [initial.] last_name" order.
# The patterns do not depend on the length of any particular name, so they are
# applied to the whole vector at once; entries that contain no "Last, ..." part
# match neither pattern and pass through unchanged.
# NOTE(review): assumes `name` is a character vector of names -- confirm.

# "Last, I. First" -> "First I. Last"
name2 <- str_replace(
  name,
  "([[:alpha:]]+), ([[:alpha:]]\\.) ([[:alpha:]]+)",
  "\\3 \\2 \\1"
)
# "Last, First" -> "First Last"
name2 <- str_replace(
  name2,
  "([[:alpha:]]+), ([[:alpha:]]+)",
  "\\2 \\1"
)
name2
## [1] "Moe Szyslak" "Montgomery C. Burns" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" "Homer Simpson" "Dr. Julius Hibbert"
- Construct a logical vector indicating whether a character has a title (i.e., Rev. and Dr.).
# Flag entries carrying a title: a run of two or more letters directly
# followed by a period.
str_detect(
  string = name,
  pattern = "[[:alpha:]]{2,}\\."
)
## [1] FALSE FALSE TRUE FALSE FALSE TRUE
- Construct a logical vector indicating whether a character has a second name.
# Flag entries that contain a second name written as an initial: a single
# upper-case letter starting at a word boundary and directly followed by a
# period. Titles such as "Rev." or "Dr." end in a lower-case letter before the
# period, so they are not matched.
# NOTE(review): a second name that is spelled out in full is not detected by
# this pattern -- confirm that initials are the only form present in `name`.
str_detect(name, "\\b[[:upper:]]\\.")
## [1] FALSE TRUE FALSE FALSE FALSE FALSE
- [0-9]+\$ - this pattern matches one or more digits (0-9) immediately followed by a literal $ sign. It can be used to extract amounts given in dollar currencies.
# Highlight every run of digits that is directly followed by a literal "$".
str_view_all(
  string = c("10£", "100$BZ", "1000\u20AC", "1000$US"),
  pattern = "[0-9]+\\$"
)
- \b[a-z]{1,4}\b - this pattern matches lower case words that are 1 to 4 characters long.
# Highlight each stand-alone lower-case word of one to four letters; with
# match = TRUE only strings that contain a match are displayed.
str_view_all(
  string = "Copy the introductory example. The vector name stores the extracted names",
  pattern = "\\b[a-z]{1,4}\\b",
  match = TRUE
)
- .*?\.txt$ - this pattern matches strings that end with “.txt”. It can be used to pick out text files.
# Highlight the strings that end in a literal ".txt".
str_view_all(
  string = c("name.rmd", "name.doc", "name.xlsx", "name.txt", ".txt"),
  pattern = ".*?\\.txt$"
)
- \d{2}/\d{2}/\d{4} - this pattern matches dates in the format mm/dd/yyyy or dd/mm/yyyy.
# Highlight slash-separated dates made of two digits, two digits and four
# digits.
str_view_all(
  string = c(
    "19/02/2017",
    "19th February, 2017",
    "19-02-2017",
    "Feb.19.2017",
    "02/19/2017 10:30:15"
  ),
  pattern = "\\d{2}/\\d{2}/\\d{4}"
)
- <(.+?)>.+?</\1> - this pattern matches html tags and their contents.
# Download the promo page (following redirects) and keep the response text.
webpage <- getURL(
  "http://global.nba.com/wp-content/include/global/splash_page/promo.html?gr=www",
  followlocation = TRUE
)
# Return, as one flat character vector, every stretch of the page made of an
# opening tag, some content and a closing tag that repeats the captured text.
unlist(
  str_extract_all(string = webpage, pattern = "<(.+?)>.+?</\\1>")
)
## [1] "<title>NBA.com: Promo</title>"
- The following code hides a secret message. Crack it with R and regular expressions.
# Source "https://www.r-bloggers.com/htmltotext-extracting-text-from-html-via-xpath/"
# Download the exercise text, following redirects.
html <- getURL(
  "http://www.r-datacollection.com/materials/regex/code_exercise.txt",
  followlocation = TRUE
)
# Parse the downloaded string as HTML; asText = TRUE marks the first argument
# as document content rather than a file name.
doc <- htmlParse(html, asText = TRUE)
# Collect the text value of every <p> node in the parsed document.
code_exercise <- xpathSApply(doc, "//p", xmlValue)
# Print the collected text, one element per line.
cat(paste(code_exercise, collapse = "\n"))
## clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr
# Decode the hidden message in three nested steps (innermost first):
#   1. extract every capital letter, "." and "!" from code_exercise,
#   2. join the extracted pieces into a single string,
#   3. replace each "." separator with a blank space.
str_replace_all(
  str_c(
    unlist(str_extract_all(code_exercise, "[A-Z]|\\.|!")),
    collapse = ""
  ),
  "\\.",
  " "
)
## [1] "CONGRATULATIONS YOU ARE A SUPERNERD!"