————————————————————–
AUTOMATED DATA COLLECTION WITH R
CODE CHAPTER 8: REGULAR EXPRESSIONS AND ESSENTIAL STRING FUNCTIONS
Problems 3,7,8,9
————————————————————–
  1. Load Packages
library(stringr)
library(XML)
library(RCurl)
## Loading required package: bitops
library(bitops)
library(tau)
  1. Problem #3
    Copy the introductory example. The vector name stores the extracted names.
## Start from a single raw string in which phone numbers and names are interleaved.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5543642Dr. Julius Hibbert"
## Names: every run of two or more letters, periods, commas or spaces.
name <- unlist(str_extract_all(raw.data, "[[:alpha:]., ]{2,}"))
name
## [1] "Moe Szyslak"          "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders"         "Simpson, Homer"       "Dr. Julius Hibbert"
## Phone numbers: an optional (possibly parenthesised) three-digit area code,
## then three digits and four digits, each optionally separated by "-" or " ".
## The extraction is run once; repeating it only recomputes the same vector.
phone <- unlist(str_extract_all(raw.data, "\\(?(\\d{3})?\\)?(-| )?\\d{3}(-| )?\\d{4}"))
phone
## [1] "555-1239"       "(636) 555-0113" "555-6542"       "555 8904"      
## [5] "636-555-3226"   "5543642"
## Pair each name with its phone number; both vectors are in order of appearance.
data.frame(name = name, phone = phone)
##                   name          phone
## 1          Moe Szyslak       555-1239
## 2 Burns, C. Montgomery (636) 555-0113
## 3 Rev. Timothy Lovejoy       555-6542
## 4         Ned Flanders       555 8904
## 5       Simpson, Homer   636-555-3226
## 6   Dr. Julius Hibbert        5543642
(a) Use the tools of the chapter to rearrange the vector so that all elements conform to the standard first_name last_name.
## First names. For each entry take either the final word when it is preceded
## by punctuation ("last, first" layout) or the first word followed by a space.
## str_extract() yields exactly one value per element of `name` (NA when
## nothing matches), so the result always stays aligned with `name`.
fname <- str_extract(name, "[[:punct:]] [[:alpha:]]{2,}$|[[:alpha:]]{2,} ")
fname
## [1] "Moe "         ". Montgomery" "Timothy "     "Ned "        
## [5] ", Homer"      "Julius "
## Drop the surrounding punctuation and spaces, keeping only the word.
fname <- str_extract(fname, "[[:alpha:]]{2,}")
fname
## [1] "Moe"        "Montgomery" "Timothy"    "Ned"        "Homer"     
## [6] "Julius"
## Last names. Take either the final word when it is NOT preceded by
## punctuation ("first last" layout) or the word followed by ", ".
lname <- str_extract(name, "[^[:punct:]] [[:alpha:]]{2,}$|[[:alpha:]]{2,}, ")
## Keep only the word itself.
lname <- str_extract(lname, "[[:alpha:]]{2,}")
lname
## [1] "Szyslak"  "Burns"    "Lovejoy"  "Flanders" "Simpson"  "Hibbert"
## Titles: two or more letters immediately followed by a period.
title_pattern <- "[[:alpha:]]{2,}\\."
title <- unlist(str_extract_all(name, title_pattern))
title
## [1] "Rev." "Dr."
## Test every name against the pattern itself rather than against `title`:
## `title` is shorter than `name`, so using it as the pattern would pair names
## and titles by recycled position instead of checking each name for a title.
title_exists <- str_detect(name, title_pattern)
title_exists
## [1] FALSE FALSE  TRUE FALSE FALSE  TRUE
##update the name dataframe
data.frame(fname = fname, lname = lname, title_exists = title_exists)
##        fname    lname title_exists
## 1        Moe  Szyslak        FALSE
## 2 Montgomery    Burns        FALSE
## 3    Timothy  Lovejoy         TRUE
## 4        Ned Flanders        FALSE
## 5      Homer  Simpson        FALSE
## 6     Julius  Hibbert         TRUE
## Second names: a single letter (optionally followed by a period), a space,
## and a further word, e.g. "C. Montgomery".
second_name_pattern <- "[[:alpha:]]{1}\\.? [[:alpha:]]{1,}\\.?"
## The leading space anchors the single letter to the start of a word.
spaced_second_name_pattern <- paste0(" ", second_name_pattern)
fname2 <- unlist(str_extract_all(name, spaced_second_name_pattern))
## Re-extract without the leading space to trim it from the result.
fname2 <- unlist(str_extract_all(fname2, second_name_pattern))
fname2
## [1] "C. Montgomery"
fname2_exists <- str_detect(name, spaced_second_name_pattern)
fname2_exists
## [1] FALSE  TRUE FALSE FALSE FALSE FALSE
data.frame(fname = fname, lname = lname, phone = phone, title_exists = title_exists, fname2_exists = fname2_exists)
##        fname    lname          phone title_exists fname2_exists
## 1        Moe  Szyslak       555-1239        FALSE         FALSE
## 2 Montgomery    Burns (636) 555-0113        FALSE          TRUE
## 3    Timothy  Lovejoy       555-6542         TRUE         FALSE
## 4        Ned Flanders       555 8904        FALSE         FALSE
## 5      Homer  Simpson   636-555-3226        FALSE         FALSE
## 6     Julius  Hibbert        5543642         TRUE         FALSE
  1. Problem #7 The expression “<.+>” fails because “.+” is greedy and matches up to the last “>” in the string; restricting the tag content to alphabetic characters extracts only the first tag
## Sample string: text wrapped in an opening and a closing tag.
raw.data <- "<title>+++BREAKING NEWS+++</title>"
## Take the first match of "<", one or more letters, ">".
html <- str_extract(
  string = raw.data,
  pattern = "<[[:alpha:]]{1,}>"
)
html
## [1] "<title>"
  1. Problem #8 The expression [^1-9=+*()] is replaced with “[^[1-9]{1,}]” which captures =+*()
## Sample string: a worked instance of the binomial formula.
raw.data <- "(5-3)^2=5^2-2*5*3+3^2"
## Collect every match of the negated character class as a flat character vector.
## NOTE(review): the "{1,}" sits inside the bracket expression, so it is not
## acting as a quantifier; each match is a single character. Confirm that this
## pattern is what was intended.
binomial <- unlist(
  str_extract_all(string = raw.data, pattern = "[^[1-9]{1,}]")
)
binomial
##  [1] "(" "-" ")" "^" "=" "^" "-" "*" "*" "+" "^"