The following raw.data string was taken from: http://www.r-datacollection.com/materials/ch-8-regex/ch-8-regex.r
3) Setup: Copy the intro example:
library(stringr)
library(XML)
library(RCurl)
library(tau)
# Titles (honorifics) to recognise; add to the alternation if necessary.
# The dots are escaped so that they match a literal "." only. An unescaped "."
# matches any character, so "Dr." would also match "Dre" in "Drew" and "Mr."
# would match "Mrs".
titles <- "(Mr\\.|Mrs\\.|Dr\\.|Rev\\.|Prof\\.)"
# A difficult example: names and phone numbers run together in a single string
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5543642Dr. Julius Hibbert"
# Extract information
# Names: runs of at least two letters, dots, commas or spaces
name <- str_extract_all(
  string = raw.data,
  pattern = "[[:alpha:]., ]{2,}"
)[[1]]
# Phone numbers: optional area code (with optional parentheses), then
# three digits and four digits, each optionally preceded by "-" or " "
phone <- str_extract_all(
  string = raw.data,
  pattern = "\\(?(\\d{3})?\\)?(-| )?\\d{3}(-| )?\\d{4}"
)[[1]]
# Collect the names in a data frame and display it
df <- data.frame(name = name)
df
## name
## 1 Moe Szyslak
## 2 Burns, C. Montgomery
## 3 Rev. Timothy Lovejoy
## 4 Ned Flanders
## 5 Simpson, Homer
## 6 Dr. Julius Hibbert
3a) extract firstname lastname
df$firstname_lastname <- df$name
# Get rid of all the titles, then trim the whitespace a removed title leaves behind
df$firstname_lastname <- str_replace_all(df$firstname_lastname, titles, "")
df$firstname_lastname <- trimws(df$firstname_lastname)
# "Last, First [Middle]" -> "First [Middle] Last". Everything after the comma is
# moved to the front as one unit, so initials and middle names keep their dots
# and their order ("Burns, C. Montgomery" -> "C. Montgomery Burns").
# Names without a comma do not match and are left unchanged.
df$firstname_lastname <- sub("^([^,]+),\\s*(.+)$", "\\2 \\1", df$firstname_lastname)
3b) construct logical vector stating whether the name contains a title, ie - dr, mr..
# TRUE where the original name contains one of the titles
df$has_title <- str_detect(df$name, titles)
3c) construct logical vector stating whether the name contains a second name
# Count whitespace-separated parts of the cleaned-up name: more than two parts
# (first + last) means there is a second name or initial. Counting parts rather
# than punctuation keeps a trailing dot or a hyphen from being miscounted.
df$has_second_name <- str_count(df$firstname_lastname, "\\S+") > 2
problem 3 results:
df
## name firstname_lastname has_title has_second_name
## 1 Moe Szyslak Moe Szyslak FALSE FALSE
## 2 Burns, C. Montgomery C. Montgomery Burns FALSE TRUE
## 3 Rev. Timothy Lovejoy Timothy Lovejoy TRUE FALSE
## 4 Ned Flanders Ned Flanders FALSE FALSE
## 5 Simpson, Homer Homer Simpson FALSE FALSE
## 6 Dr. Julius Hibbert Julius Hibbert TRUE FALSE
7) Extract the first HTML tag from the following string:
<.+> doesn’t work because the . matches any character, including the closing >, and the + is ‘greedy’: the match runs on to the last > in the string. Using [a-z]+ between the angle brackets says “take the opening tag, but allow only lowercase letters inside it”, so the match has to stop at the first >.
# Sample markup with an opening and a closing tag
the_html_text <- "<title>+++BREAKING NEWS+++</title>"
# Greedy attempt: .+ also swallows ">" characters, so the match runs to the last ">"
does_not_work <- str_extract(string = the_html_text, pattern = "<.+>")
does_not_work
## [1] "<title>+++BREAKING NEWS+++</title>"
# Allowing only lowercase letters inside the tag stops the match at the first ">"
does_work <- str_extract(string = the_html_text, pattern = "<([a-z]+)>")
does_work
## [1] "<title>"
8) Why does the bad regexp fail?
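formula: (5-3)^2=5^2-2*5*3+3^2 bad regexp: [^0-9=+*()]+
Answer: a ^ in the first position of a character class negates the class, so the bad regexp matches only characters that are NOT listed. To match a literal ^, place it anywhere in the class except first.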
# The expression we would like to extract in full
formula <- "(5-3)^2=5^2-2*5*3+3^2"
# The leading ^ negates the class, so this matches characters NOT in the list
bad_regexp <- "[^0-9=+*()-]+"
str_extract(string = formula, pattern = bad_regexp)
## [1] "^"
# fix: move the ^ away from the first position, so the interpreter knows you actually mean the ^ character
# the r docs say: To include a literal ^, place it anywhere but first.
# Keep the - as the LAST character of the class: a - between two characters
# denotes a range, and ")-^" would admit every character from ")" to "^"
# (all uppercase letters, commas, ...) instead of just the ones listed.
fixed_regexp <- "[0-9=+*()^-]+"
# With the literal ^ inside the class, the first match covers the whole formula
str_extract(string = formula, pattern = fixed_regexp)
## [1] "(5-3)^2=5^2-2*5*3+3^2"