DA607 Week 04 Assignment

The following raw.data string was taken from: http://www.r-datacollection.com/materials/ch-8-regex/ch-8-regex.r

3) Setup: Copy the intro example:

library(stringr)
library(XML)
library(RCurl)
library(tau)

# Honorifics that may precede a name; extend the alternation if necessary.
# The period is escaped so that it only matches a literal "." -- an unescaped
# "." matches any character, so "Dr." would also match the "Dre" in "Drew".
# The leading word boundary keeps a title from matching inside a longer word.
titles <- "\\b(Mr|Mrs|Dr|Rev|Prof)\\."

# A difficult example: names and phone numbers run together in one string.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5543642Dr. Julius Hibbert"

# Extract information
# Names: runs of at least two letters, periods, commas or spaces.
name <- unlist(str_extract_all(raw.data, "[[:alpha:]., ]{2,}"))
# Phone numbers: optional (possibly parenthesised) 3-digit area code, then
# 3 + 4 digits, each group optionally separated by "-" or a space.
# Extracted for reference only; it is not stored in df below.
phone <- unlist(str_extract_all(raw.data, "\\(?(\\d{3})?\\)?(-| )?\\d{3}(-| )?\\d{4}"))
# Keep the names as character strings regardless of the R version's
# stringsAsFactors default (TRUE before R 4.0.0).
df <- data.frame(name = name, stringsAsFactors = FALSE)
df

##                   name
## 1          Moe Szyslak
## 2 Burns, C. Montgomery
## 3 Rev. Timothy Lovejoy
## 4         Ned Flanders
## 5       Simpson, Homer
## 6   Dr. Julius Hibbert

3a) extract firstname lastname

# Build a "First [Middle] Last" version of every name.
# Step 1: drop any honorific (Mr., Dr., Rev., ...) and trim the whitespace
# it leaves behind.
df$firstname_lastname <- trimws(str_replace_all(df$name, titles, ""))
# Step 2: names written as "Last, First [Middle]" are flipped around the
# comma. Everything after the comma moves as one unit, so
# "Burns, C. Montgomery" becomes "C. Montgomery Burns"; swapping only the
# single words adjacent to the comma would yield "C Burns. Montgomery".
# Names without a comma are left unchanged.
df$firstname_lastname <- sub(
  "^([^,]+),\\s*(.+)$", "\\2 \\1", df$firstname_lastname
)

3b) Construct a logical vector indicating whether the name contains a title (e.g., Dr., Mr.)

# TRUE for every name in which the title pattern matches at least once.
df$has_title <- str_detect(df$name, titles)

3c) construct logical vector stating whether the name contains a second name

# A name has a second (middle) name when the title-free name consists of more
# than two whitespace-separated parts. Counting parts instead of runs of
# non-word characters keeps hyphenated or apostrophised names such as
# "Jean-Luc Picard" or "Miles O'Brien" from being flagged by mistake.
df$has_second_name <-
  lengths(strsplit(trimws(df$firstname_lastname), "\\s+")) > 2L

problem 3 results:

# Show the data frame with the columns derived in 3a-3c.
df

##                   name  firstname_lastname has_title has_second_name
## 1          Moe Szyslak         Moe Szyslak     FALSE           FALSE
## 2 Burns, C. Montgomery C Burns. Montgomery     FALSE            TRUE
## 3 Rev. Timothy Lovejoy     Timothy Lovejoy      TRUE           FALSE
## 4         Ned Flanders        Ned Flanders     FALSE           FALSE
## 5       Simpson, Homer       Homer Simpson     FALSE           FALSE
## 6   Dr. Julius Hibbert      Julius Hibbert      TRUE           FALSE

7) Extract the first html tag from the following string:

<title>+++BREAKING NEWS+++</title>

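<.+> doesn’t work because . matches every character, including the closing >, and + is ‘greedy’: the match is extended as far as possible, so it ends at the last > in the string rather than the first. Using [a-z] instead says “take the opening tag”: only lowercase letters are allowed between < and >, so the match cannot run past the first closing > into the text or the closing tag.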

# Sample markup for the exercise; the first tag in it is "<title>".
the_html_text <- "<title>+++BREAKING NEWS+++</title>"
# Greedy attempt: ".+" extends to the last ">" in the string, so the match
# covers the whole element instead of just the opening tag.
greedy_tag_pattern <- "<.+>"
does_not_work <- str_extract(
  string = the_html_text,
  pattern = greedy_tag_pattern
)
does_not_work

## [1] "<title>+++BREAKING NEWS+++</title>"

# Match "<", then one or more characters that are not ">", then ">".
# Excluding ">" from the tag body stops the match at the first closing
# bracket, just as restricting the body to [a-z] does, but it also accepts
# tags containing digits, uppercase letters or attributes
# (e.g. <h1>, <TITLE>, <a href="...">).
does_work <- str_extract(the_html_text, "<[^>]+>")
does_work

## [1] "<title>"

8) Why does the bad regexp fail?

formula: (5-3)^2=5^2-2*5*3+3^2 bad regexp: [^0-9=+*()]+

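Answer: a ^ in the first position of a character class negates the class, so the bad regexp matches only the characters that are NOT listed. To match a literal ^, place it anywhere but first inside the brackets.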

# The formula that the pattern is supposed to match in full.
formula <- "(5-3)^2=5^2-2*5*3+3^2"

# The leading "^" negates the character class, so this pattern matches runs
# of characters that are NOT digits, "=", "+", "*", parentheses or "-".
bad_regexp <- "[^0-9=+*()-]+"
str_extract(string = formula, pattern = bad_regexp)

## [1] "^"

# Fix: move "^" away from the first position of the character class so it is
# read as a literal caret rather than as negation (the R docs say: to include
# a literal ^, place it anywhere but first).
# The "-" stays last so that it is literal too. Writing "-^" directly after
# ")" would define the range ")" to "^" (0x29-0x5E), which silently admits
# uppercase letters and punctuation such as , . / : ; < > ? @ [ \ ].
fixed_regexp <- "[0-9=+*()^-]+"
str_extract(formula, fixed_regexp)

## [1] "(5-3)^2=5^2-2*5*3+3^2"

DA607 Week 04 Assignment

Dan Fanelli

February 17, 2016