————————————————————–
AUTOMATED DATA COLLECTION WITH R
CODE CHAPTER 8: REGULAR EXPRESSIONS AND ESSENTIAL STRING FUNCTIONS
Problems 3,7,8,9
————————————————————–
- Load Packages
library(stringr)
library(XML)
library(RCurl)
## Loading required package: bitops
library(bitops)
library(tau)
- Problem #3
Copy the introductory example. The vector name stores the extracted names.
## Raw input: a single string in which phone numbers and names run together.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5543642Dr. Julius Hibbert"
## A name is any run of two or more letters, periods, commas or spaces.
## raw.data has exactly one element, so the first list entry holds every match.
name <- str_extract_all(raw.data, "[[:alpha:]., ]{2,}")[[1]]
print(name)
## [1] "Moe Szyslak" "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" "Simpson, Homer" "Dr. Julius Hibbert"
## Extract a vector of phone elements. The pattern reads: optional "(",
## optional 3-digit area code, optional ")", optional "-" or " " separator,
## 3 digits, optional separator, 4 digits.
## The assignment is executed once; an identical duplicate call was removed.
phone <- unlist(str_extract_all(raw.data, "\\(?(\\d{3})?\\)?(-| )?\\d{3}(-| )?\\d{4}"))
phone
## [1] "555-1239" "(636) 555-0113" "555-6542" "555 8904"
## [5] "636-555-3226" "5543642"
## Combine the two extracted vectors into a data frame, one row per person.
## Both vectors come from the same left-to-right scan of raw.data, so element
## i of name belongs with element i of phone.
data.frame(
  name = name,
  phone = phone
)
## name phone
## 1 Moe Szyslak 555-1239
## 2 Burns, C. Montgomery (636) 555-0113
## 3 Rev. Timothy Lovejoy 555-6542
## 4 Ned Flanders 555 8904
## 5 Simpson, Homer 636-555-3226
## 6 Dr. Julius Hibbert 5543642
(a) Use the tools of the chapter to rearrange the vector so that all elements conform to the standard first_name last_name.
## Extract a vector of fname elements.
## Step 1: take either a trailing "<punctuation> Word" (covers the
## "Last, First" and "Last, X. First" layouts) or a "Word " followed by a
## space (covers "First Last" and "Title. First Last").
## The POSIX class is written inside a bracket expression, [[:punct:]], to
## match the [^[:punct:]] form used for the last names and to stay valid in
## regex engines that do not accept a bare [:punct:].
fname <- unlist(str_extract_all(name, "[[:punct:]] [[:alpha:]]{2,}$|[[:alpha:]]{2,} "))
fname
## [1] "Moe " ". Montgomery" "Timothy " "Ned "
## [5] ", Homer" "Julius "
## Step 2: keep only the letters, dropping the punctuation and spaces that
## step 1 used as anchors.
fname <- unlist(str_extract_all(fname,"[[:alpha:]]{2,}"))
fname
## [1] "Moe" "Montgomery" "Timothy" "Ned" "Homer"
## [6] "Julius"
## Last names, step 1: take either a trailing " Word" whose preceding
## character is not punctuation ("First Last" layout) or a "Word, " that is
## followed by a comma and a space ("Last, First" layout).
lname_raw <- unlist(str_extract_all(name, "[^[:punct:]] [[:alpha:]]{2,}$|[[:alpha:]]{2,}, "))
## Step 2: reduce each match to its run of two or more letters, which drops
## the single leading character, the comma and the spaces.
lname <- unlist(str_extract_all(lname_raw, "[[:alpha:]]{2,}"))
print(lname)
## [1] "Szyslak" "Burns" "Lovejoy" "Flanders" "Simpson" "Hibbert"
## Extract a vector of title elements. A title is two or more letters
## followed by a period, so one-letter initials such as "C." are not titles.
title_pattern <- "[[:alpha:]]{2,}\\."
title <- unlist(str_extract_all(name, title_pattern))
title
## [1] "Rev." "Dr."
## Flag which names carry a title by testing every name against the title
## pattern itself. Testing against the extracted `title` vector would pair
## six names with two patterns, which is only right when recycling happens to
## line up and is rejected as a length mismatch by stringr >= 1.5.0.
title_exists <- str_detect(name, title_pattern)
title_exists
## [1] FALSE FALSE TRUE FALSE FALSE TRUE
## Rebuild the name table from its parts: first name, last name, and a flag
## marking whether the original entry carried a title.
data.frame(
  fname = fname,
  lname = lname,
  title_exists = title_exists
)
## fname lname title_exists
## 1 Moe Szyslak FALSE
## 2 Montgomery Burns FALSE
## 3 Timothy Lovejoy TRUE
## 4 Ned Flanders FALSE
## 5 Homer Simpson FALSE
## 6 Julius Hibbert TRUE
## Second (middle) names: a space, one letter with an optional period, a
## space, then one or more letters with an optional period.
second_name_pattern <- " [[:alpha:]]{1}\\.? [[:alpha:]]{1,}\\.?"
fname2_spaced <- unlist(str_extract_all(name, second_name_pattern))
## Re-extract without the leading space to trim it from the match.
fname2 <- unlist(str_extract_all(fname2_spaced, "[[:alpha:]]{1}\\.? [[:alpha:]]{1,}\\.?"))
print(fname2)
## [1] "C. Montgomery"
## Flag the names that contain such a second name; str_detect already
## returns a plain logical vector, one element per name.
fname2_exists <- str_detect(name, second_name_pattern)
print(fname2_exists)
## [1] FALSE TRUE FALSE FALSE FALSE FALSE
## Final table: name parts, phone number, and the two flags
## (title present, second name present), one row per person.
data.frame(
  fname = fname,
  lname = lname,
  phone = phone,
  title_exists = title_exists,
  fname2_exists = fname2_exists
)
## fname lname phone title_exists fname2_exists
## 1 Moe Szyslak 555-1239 FALSE FALSE
## 2 Montgomery Burns (636) 555-0113 FALSE TRUE
## 3 Timothy Lovejoy 555-6542 TRUE FALSE
## 4 Ned Flanders 555 8904 FALSE FALSE
## 5 Homer Simpson 636-555-3226 FALSE FALSE
## 6 Julius Hibbert 5543642 TRUE FALSE
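- Problem #7 The expression “<.+>” fails because “.+” is greedy: it matches as many characters as possible, so the match runs to the last “>” in the string and returns the whole string instead of only the first tag.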
## Sample input: one HTML element whose content also contains non-letters.
raw.data <- "<title>+++BREAKING NEWS+++</title>"
## Extract the first tag only. The class [^<>] cannot step over a ">" (or a
## new "<"), so the match stops at the first closing bracket. Unlike a
## letters-only class, this also accepts tags such as <h1> or tags that
## carry attributes. str_extract returns the first match only.
html <- str_extract(raw.data, "<[^<>]+>")
html
## [1] "<title>"
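- Problem #8 The expression “[^1-9=+*()]” fails because the symbols =+*() are listed inside the negated class and are therefore excluded from the match. It is replaced with a negated class that excludes only the digits (“[^[1-9]{1,}]”), which captures the symbols ( ) ^ = - * +.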
## Sample input: the binomial expansion written as one string.
raw.data <- "(5-3)^2=5^2-2*5*3+3^2"
## Extract every character that is not a digit, i.e. the brackets and
## operators. This replaces the earlier pattern "[^[1-9]{1,}]": a "{1,}"
## placed inside the brackets does not act as a quantifier, and the range
## 1-9 leaves out 0. On this input both patterns return the same matches.
binomial <- unlist(str_extract_all(raw.data, "[^[:digit:]]"))
binomial
## [1] "(" "-" ")" "^" "=" "^" "-" "*" "*" "+" "^"