# Problem 3
# From Automated Data Collection with R; introductory example.
# Raw phone-book text in which names and phone numbers are run together.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555 -6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"
library(stringr)
# A name is any run of two or more letters, periods, commas or spaces.
name <- unlist(str_extract_all(raw.data, "[[:alpha:]., ]{2,}"))
name
## [1] "Moe Szyslak" "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" "Simpson, Homer" "Dr. Julius Hibbert"
# 3(a)
# Rewrite every name as "first_name last_name".
# Step 1: drop the titles. The periods are escaped so that only a literal
# "Rev." or "Dr." is removed (an unescaped "." matches any character), and
# \\s* also consumes the blank that follows the title.
name <- str_replace_all(name, pattern = "\\b(Rev|Dr)\\.\\s*", replacement = "")
# From page 205 Automated Data Collection with R
# Step 2: names written as "Last, First" are exactly those containing a comma.
# grep() takes the pattern first and the vector to search second.
comma_place <- grep(",", name, fixed = TRUE)
# Everything before the first comma is the last name; everything after it is
# the first (and second) name.
last_names <- str_extract(name[comma_place], "^[^,]+")
first_names <- str_replace(name[comma_place], "^[^,]+,", "")
right_order <- paste(str_trim(first_names), str_trim(last_names))
name[comma_place] <- right_order
# Remove any comma that is left over and tidy surrounding whitespace.
name <- str_trim(str_replace_all(name, ",", ""))
name
## [1] "Moe Szyslak" "C. Montgomery Burns" "Timothy Lovejoy"
## [4] "Ned Flanders" "Homer Simpson" "Julius Hibbert"
# 3(b)
# From page 209, Automated Data Collection with R
# Build a logical vector telling whether each name carries a title.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555 -6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"
name <- unlist(str_extract_all(raw.data, "[[:alpha:]., ]{2,}"))
# The period is escaped and the title must start at a word boundary; with an
# unescaped "." names such as "Drew" or "Revere" would be reported as titles.
title_pattern <- "\\b(Rev|Dr)\\."
name_titles <- str_detect(name, title_pattern)
name_titles
## [1] FALSE FALSE TRUE FALSE FALSE TRUE
# 3(c)
# From page 204, Automated Data Collection with R
# Build a logical vector telling whether each name has a second name.
# Titles are removed first so that the period in "Rev." / "Dr." is not taken
# for the period of an abbreviated second name. The periods in the pattern are
# escaped (an unescaped "." matches any character) and \\s* removes the blank
# after the title so no leading space is left in the name.
name <- str_replace_all(name, pattern = "\\b(Rev|Dr)\\.\\s*", replacement = "")
# Any literal period that remains marks an abbreviated second name.
name_second <- str_detect(name, "\\.")
name_second
## [1] FALSE TRUE FALSE FALSE FALSE FALSE
name[name_second]
## [1] "Burns, C. Montgomery"
# Problem 4
# (a) [0-9]+\\$: matches a run of one or more digits that is immediately
# followed by a literal dollar sign (the backslashes escape "$", which would
# otherwise be the end-of-string anchor).
test1 <- c("500$")
str_extract_all(string = test1, pattern = "[0-9]+\\$")
## [[1]]
## [1] "500$"
# (b) \\b[a-z]{1,4}\\b: matches whole words made up of one to four lowercase
# letters; the \\b on each side requires a word boundary, so longer words and
# words containing capitals are skipped.
test2 <- c("I am going to California next week for vacation.")
str_extract_all(string = test2, pattern = "\\b[a-z]{1,4}\\b")
## [[1]]
## [1] "am" "to" "next" "week" "for"
# (c) .*?\\.txt$: matches a string that ends with the literal text ".txt",
# preceded by any (possibly empty) sequence of characters.
test3 <- c(".txt", "tom.txt", "data607.txt", "data606.txt")
unlist(str_extract_all(string = test3, pattern = ".*?\\.txt$"))
## [1] ".txt" "tom.txt" "data607.txt" "data606.txt"
# (d) \\d{2}/\\d{2}/\\d{4}: matches two digits, a slash, two digits, a slash
# and four digits, e.g. a date written as mm/dd/yyyy.
# Note: test3 is assigned a new value here.
test3 <- c(
  "03/05/1989", "12/31/1988", "Jan 10, 1992", "12/10/1959", "May 12, 1989"
)
unlist(str_extract_all(string = test3, pattern = "\\d{2}/\\d{2}/\\d{4}"))
## [1] "03/05/1989" "12/31/1988" "12/10/1959"
# (e) <(.+?)>.+?</\\1>: matches an opening tag <name>, at least one character
# of content, and the closing tag </name>; the backreference \\1 forces the
# closing tag to repeat the name captured in the opening tag.
test4 <- "<script> R is fun </script>"
unlist(str_extract_all(string = test4, pattern = "<(.+?)>.+?</\\1>"))
## [1] "<script> R is fun </script>"