Please deliver links to an R Markdown file (in GitHub and rpubs.com) with solutions to problems 3 and 4 from chapter 8 of Automated Data Collection in R. Problem 9 is extra credit.
Load the required library:
library(stringr)
Below is the original raw data: a single character string in which phone numbers and names are run together.
# Raw input: one character string with phone numbers and names run together.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"
print(raw.data)
## [1] "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"
# Extract every run of two or more letters, spaces, periods, or commas;
# digits, hyphens, and parentheses break the runs.
# raw.data is a single string, so the first list element holds all matches.
name <- str_extract_all(raw.data, "[[:alpha:] ., ]{2,}")[[1]]
name
## [1] "Moe Szyslak" "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" "Simpson, Homer" "Dr. Julius Hibbert"
# Strip titles: two or more letters followed by a period and one whitespace
# character (e.g. "Rev. ", "Dr. "). A single-letter initial such as "C. " is
# kept because the pattern requires at least two letters.
fname_lname <- str_remove_all(name, "[[:alpha:]]{2,}\\.\\s")
fname_lname
## [1] "Moe Szyslak" "Burns, C. Montgomery" "Timothy Lovejoy"
## [4] "Ned Flanders" "Simpson, Homer" "Julius Hibbert"
# Indices of names written as "Last, First". str_detect() already returns a
# logical vector, so it is passed to which() directly instead of being
# compared against the string "TRUE".
x <- which(str_detect(fname_lname, ","))
# Swap the parts around the comma: everything before the first comma is the
# last name, everything after it (initials and/or given names) goes first.
fname_lname[x] <- str_replace(
  str_trim(fname_lname[x]),
  "^(.+?),\\s*(.+)$",
  "\\2 \\1"
)
fname_lname
## [1] "Moe Szyslak" "C. Montgomery Burns" "Timothy Lovejoy"
## [4] "Ned Flanders" "Homer Simpson" "Julius Hibbert"
# Flag the names that carry a title: two or more letters followed by a
# period and one whitespace character.
str_detect(string = name, pattern = "[[:alpha:]]{2,}\\.\\s")
## [1] FALSE FALSE TRUE FALSE FALSE TRUE
# Flag names with a second name: once titles are removed, such a name has
# exactly three words. The count is compared numerically; searching the
# count for the character "3" would also match counts such as 13 or 30.
str_count(fname_lname, "\\w+") == 3L
## [1] FALSE TRUE FALSE FALSE FALSE FALSE
a) [0-9]+\\$ --> Matches one or more digits (0-9) immediately followed by a literal $ character.
# Test strings for pattern (a).
a <- c(
  "003342$",
  "3460.0$1$",
  "%4352T4234$$",
  "$2343423afsfsaf$",
  "$"
)
# First match per element; NA where no digit run is directly followed by "$".
a_ans <- str_extract(string = a, pattern = "[0-9]+\\$")
a_ans
## [1] "003342$" "0$" "4234$" NA NA
b) \\b[a-z]{1,4}\\b --> Matches a whole word made of one to four lowercase letters, delimited by word boundaries on both sides.
# Test strings for pattern (b).
b <- c(
  "1234",
  "1ds",
  "42bs tyv 34DFDa",
  "4234abcdefg",
  "abcd",
  "ABCD",
  "s",
  "abcdef"
)
# First standalone word of 1-4 lowercase letters per element, else NA.
b_ans <- str_extract(string = b, pattern = "\\b[a-z]{1,4}\\b")
b_ans
## [1] NA NA "tyv" NA "abcd" NA "s" NA
c) .*?\\.txt$ --> Matches a string that ends in the literal text .txt (the $ anchors the match to the end of the string).
# Test strings for pattern (c).
# NOTE(review): the variable is named `c`, the same name as base::c(); calls
# to c(...) still resolve to the function, but a different name would be
# less confusing.
c <- c(
  "dstxt",
  ".txt$",
  "file.txt",
  "any32.txt%#%ABC 434.txt",
  ".txtsometext"
)
# Only elements whose final characters are ".txt" produce a match.
c_ans <- str_extract(string = c, pattern = ".*?\\.txt$")
c_ans
## [1] NA NA
## [3] "file.txt" "any32.txt%#%ABC 434.txt"
## [5] NA
d) \\d{2}/\\d{2}/\\d{4} --> Matches two digits, a slash, two digits, a slash, and four digits (for example, a date written as dd/mm/yyyy or mm/dd/yyyy).
# Test strings for pattern (d).
d <- c(
  # NOTE(review): this element is unquoted, so R evaluates it as the division
  # 12 / 34 / 5678 and c() coerces the numeric result to a string; presumably
  # kept on purpose as a non-matching case -- confirm.
  12 / 34 / 5678,
  "12/34/5678",
  "01/05/0008",
  "01/45/567",
  "4/01/4321",
  # Backslashes must be doubled inside an R string literal; written as
  # "\31" and "\200" they are parsed as octal escapes, not as separators.
  "12\\31\\2008",
  "d/d/AD$"
)
# First dd/dd/dddd-shaped match per element, else NA.
d_ans <- str_extract(d, "\\d{2}/\\d{2}/\\d{4}")
d_ans
## [1] NA "12/34/5678" "01/05/0008" NA NA
## [6] NA NA
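e) <(.+?)>.+?</\\1> --> Matches an opening tag, its content, and the corresponding closing tag; the backreference \\1 requires the closing tag name to be identical to the opening one (e.g. a paired HTML element).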
# Test string for pattern (e): a small HTML document on a single line.
e <- c(
  "<!DOCTYPE html><html><head><title>First HTML</title></head><body>I am your first HTML file!</body></html>"
)
# The first tag with a matching closing tag later in the string is <html>,
# because "<!DOCTYPE html>" has no closing counterpart.
e_ans <- str_extract(string = e, pattern = "<(.+?)>.+?</\\1>")
e_ans
## [1] "<html><head><title>First HTML</title></head><body>I am your first HTML file!</body></html>"
# Encoded message: the hidden text is carried by the uppercase letters,
# with "." standing in for spaces and "!" as punctuation.
secretmsg <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"
# Decode in three steps inside local() so the intermediates do not leak into
# the workspace.
msg <- local({
  # 1. Keep only uppercase letters, periods, and exclamation marks.
  kept_chars <- unlist(str_extract_all(secretmsg, "[[:upper:].!]"))
  # 2. Glue the kept characters back into one string.
  joined <- str_c(kept_chars, collapse = "")
  # 3. Turn each period into a space.
  str_replace_all(joined, "\\.", " ")
})
msg
## [1] "CONGRATULATIONS YOU ARE A SUPERNERD!"