Description - Please deliver links to an R Markdown file (in GitHub and rpubs.com) with solutions to problems 3 and 4 from chapter 8 of Automated Data Collection with R. Problem 9 is extra credit. You may work in a small group, but please submit separately with names of all group participants in your submission.
# stringr supplies the str_* regex helpers used in this report.
library(stringr)

# Sample text in which names and phone numbers run together with no
# separators between the entries.
raw.data <-"555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"

# A name is any run of two or more letters, periods, commas or spaces.
# str_extract_all() returns a list with one element per input string;
# raw.data is a single string, so the first element holds every match.
name <- str_extract_all(string = raw.data, pattern = "[[:alpha:]., ]{2,}")[[1]]
print(name)
## [1] "Moe Szyslak" "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" "Simpson, Homer" "Dr. Julius Hibbert"
# Rewrite every name in "first_name last_name" order.
# Step 1: remove the first token that ends in a period -- a title such as
#         "Rev." or an initial such as "C." -- together with the space after
#         it. Replacing it with "" (not " ") avoids leaving a leading blank
#         on names that start with a title.
# Step 2: turn "Last, First" into "First Last"; names without a comma are
#         left untouched.
without_abbrev <- sub("[[:alnum:]_]+\\. ", "", name)
fn_ln <- sub("([[:alnum:]_]+), *([[:alnum:]_]+)", "\\2 \\1", without_abbrev)
fn_ln
## [1] "Moe Szyslak" "Montgomery Burns" "Timothy Lovejoy"
## [4] "Ned Flanders" "Homer Simpson" "Julius Hibbert"
# Flag the names that begin with a title such as "Rev." or "Dr.".
# A title is taken to be two or more letters followed by a period at the very
# start of the name. Requiring at least two letters keeps a leading initial
# (e.g. "C. Montgomery Burns") from being mistaken for a title.
title_vec <- str_detect(name, "^[[:alpha:]]{2,}\\.")
# Show each name next to its flag so the result can be checked by eye.
name_vs_title <- data.frame(name, title_vec)
name_vs_title
## name title_vec
## 1 Moe Szyslak FALSE
## 2 Burns, C. Montgomery FALSE
## 3 Rev. Timothy Lovejoy TRUE
## 4 Ned Flanders FALSE
## 5 Simpson, Homer FALSE
## 6 Dr. Julius Hibbert TRUE
# Flag the names that contain an abbreviated second name: a space, one or
# more word characters, a period, and another space (e.g. " C. ").
second_name <- str_detect(string = name, pattern = " [[:alnum:]_]{1,}\\. ")
# Tabulate each name alongside its flag.
second_name_df <- data.frame(name = name, second_name = second_name)
print(second_name_df)
## name second_name
## 1 Moe Szyslak FALSE
## 2 Burns, C. Montgomery TRUE
## 3 Rev. Timothy Lovejoy FALSE
## 4 Ned Flanders FALSE
## 5 Simpson, Homer FALSE
## 6 Dr. Julius Hibbert FALSE
One or more digits followed by a literal dollar sign ($)
# "[0-9]+\\$": one or more digits immediately followed by a literal "$".
# The leading letters of the example are skipped; only the digit run and the
# dollar sign are extracted.
dollar_example <- "asaxsas1231231$"
print(str_extract(string = dollar_example, pattern = "[0-9]+\\$"))
## [1] "1231231$"
A standalone word (delimited by word boundaries) made of one to four lowercase letters
# Example inputs: lowercase words of four, two and one letters, plus a
# digits-only string that has no lowercase letters to match.
strings <- c(
  "abcd",
  "ab",
  "a",
  "1234"
)
# "\\b[a-z]{1,4}\\b": one to four lowercase letters between word boundaries.
print(str_extract(string = strings, pattern = "\\b[a-z]{1,4}\\b"))
## [1] "abcd" "ab" "a" NA
Any string that ends in “.txt”, including “.txt” on its own
# Example inputs: a .txt file name, a .csv file name, and a bare ".txt".
strings <- c(
  "abc.txt",
  "abc.csv",
  ".txt"
)
# ".*?\\.txt$": any (possibly empty) prefix followed by a literal ".txt" at
# the end of the string.
print(str_extract(string = strings, pattern = ".*?\\.txt$"))
## [1] "abc.txt" NA ".txt"
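Digits laid out in the date format “mm/dd/yyyy”: two digits, a slash, two digits, a slash, and four digits. Only the layout is checked, not whether the numbers form a valid date.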
# Example inputs: the same date written without slashes, with a one-digit
# month, and in the full two-digit/two-digit/four-digit layout.
strings <- c(
  "09251996",
  "9/25/1996",
  "09/25/1996"
)
# "\\d{2}/\\d{2}/\\d{4}": exactly two digits, "/", two digits, "/", four
# digits. Only the last example has that layout.
print(str_extract(string = strings, pattern = "\\d{2}/\\d{2}/\\d{4}"))
## [1] NA NA "09/25/1996"
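An HTML/XML-style element: an opening tag, at least one character of content, and a closing tag whose name matches the opening tag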
# Example inputs: a properly closed element, one whose "closing" tag lacks
# the slash, and one with no closing tag at all.
strings <- c(
  "<title>Title of the document</title>",
  "<title>Title of the document<title>",
  "<title>Title of the document"
)
# "<(.+?)>.+?</\\1>": the tag name is captured by group 1 and the
# backreference \\1 requires the closing tag to repeat that same name.
print(str_extract(string = strings, pattern = "<(.+?)>.+?</\\1>"))
## [1] "<title>Title of the document</title>"
## [2] NA
## [3] NA