Description - Please deliver links to an R Markdown file (in GitHub and rpubs.com) with solutions to problems 3 and 4 from chapter 8 of Automated Data Collection in R. Problem 9 is extra credit. You may work in a small group, but please submit separately with names of all group participants in your submission.

library(stringr)

3

# Unstructured text in which names are interleaved with phone numbers.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"

# Keep every run of two or more letters, periods, commas and spaces.
# raw.data is a single string, so all matches sit in the first (and only)
# element of the list returned by str_extract_all().
name <- str_extract_all(
  string = raw.data,
  pattern = "[[:alpha:]., ]{2,}"
)[[1]]

name

## [1] "Moe Szyslak"          "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders"         "Simpson, Homer"       "Dr. Julius Hibbert"

Use the tools of this chapter to rearrange the vector so that all elements conform to the standard first_name last_name.

# Rearrange every entry of `name` into the form "first_name last_name".
#
# Step 1: remove the first "word." token (a title such as "Rev." / "Dr.", or
# an initial such as "C.") together with the space that follows it. The
# replacement is the empty string: substituting a blank instead would leave a
# leading space on names that start with a title.
no_prefix <- sub("[[:alnum:]_]+\\. ", "", name)

# Step 2: swap "last, first" into "first last". Entries without a comma do not
# match the pattern and are returned unchanged.
fn_ln <- sub("([[:alnum:]_]+), +([[:alnum:]_]+)", "\\2 \\1", no_prefix)
fn_ln

## [1] "Moe Szyslak"      "Montgomery Burns" "Timothy Lovejoy"
## [4] "Ned Flanders"     "Homer Simpson"    "Julius Hibbert"

Construct a logical vector indicating whether a character has a title (i.e., Rev. and Dr.).

# TRUE where the entry begins with a word immediately followed by a period.
title_vec <- str_detect(string = name, pattern = "(^[[:alnum:]_]{1,}\\.).*")

# Show each name next to its flag.
name_vs_title <- data.frame(name = name, title_vec = title_vec)
name_vs_title

##                   name title_vec
## 1          Moe Szyslak     FALSE
## 2 Burns, C. Montgomery     FALSE
## 3 Rev. Timothy Lovejoy      TRUE
## 4         Ned Flanders     FALSE
## 5       Simpson, Homer     FALSE
## 6   Dr. Julius Hibbert      TRUE

Construct a logical vector indicating whether a character has a second name.

# TRUE where the entry contains a word followed by a period with a space on
# both sides, i.e. an abbreviated name that is not at the start of the entry.
second_name <- str_detect(string = name, pattern = " [[:alnum:]_]{1,}\\. ")

# Show each name next to its flag.
second_name_df <- data.frame(name = name, second_name = second_name)
second_name_df

##                   name second_name
## 1          Moe Szyslak       FALSE
## 2 Burns, C. Montgomery        TRUE
## 3 Rev. Timothy Lovejoy       FALSE
## 4         Ned Flanders       FALSE
## 5       Simpson, Homer       FALSE
## 6   Dr. Julius Hibbert       FALSE

4

[0-9]+\$

Matches one or more digits immediately followed by a literal dollar sign ($).

# Only the trailing digits plus the dollar sign are extracted.
str_extract(
  string = "asaxsas1231231$",
  pattern = "[0-9]+\\$"
)

## [1] "1231231$"

\b[a-z]{1,4}\b

Matches a standalone word made of one to four lowercase letters (the \b on each side requires a word boundary).

# Three lowercase words of 4, 2 and 1 letters, plus a digits-only string.
strings <- c(
  "abcd",
  "ab",
  "a",
  "1234"
)
str_extract(string = strings, pattern = "\\b[a-z]{1,4}\\b")

## [1] "abcd" "ab"   "a"    NA

.*?\.txt$

returns strings ending in “.txt”

# Two inputs end in ".txt" (one has nothing before the extension); one ends
# in ".csv".
strings <- c(
  "abc.txt",
  "abc.csv",
  ".txt"
)
str_extract(string = strings, pattern = ".*?\\.txt$")

## [1] "abc.txt" NA        ".txt"

\d{2}/\d{2}/\d{4}

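Matches a date written as two digits, two digits and four digits separated by slashes, e.g. “mm/dd/yyyy” (the pattern checks only the digit counts, not which field is the month).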

# The same date with no separators, with a one-digit month, and fully padded.
strings <- c(
  "09251996",
  "9/25/1996",
  "09/25/1996"
)
str_extract(string = strings, pattern = "\\d{2}/\\d{2}/\\d{4}")

## [1] NA           NA           "09/25/1996"

<(.+?)>.+?</\1>

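Matches a complete HTML/XML element: an opening tag, some content, and a closing tag whose name equals the opening tag's name (enforced by the backreference \1).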

# A properly closed element, one "closed" with a second opening tag, and one
# with no closing tag at all.
strings <- c(
  "<title>Title of the document</title>",
  "<title>Title of the document<title>",
  "<title>Title of the document"
)
str_extract(string = strings, pattern = "<(.+?)>.+?</\\1>")

## [1] "<title>Title of the document</title>"
## [2] NA                                    
## [3] NA

Assignment 3 - Data607

3

4