This is the week 3 assignment, which focuses on manipulating text in R.
library(stringr)
# Raw input: names and phone numbers concatenated with no delimiters.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"

# Names: runs of at least two letters, periods, commas, or spaces.
name <- str_extract_all(
  string = raw.data,
  pattern = "[[:alpha:]., ]{2,}"
)[[1]]

# Phone numbers: an optional 3-digit area code (optionally parenthesised),
# then 3 digits and 4 digits, with an optional hyphen or space between groups.
phone <- str_extract_all(
  string = raw.data,
  pattern = "\\(?(\\d{3})?\\)?(-| )?\\d{3}(-| )?\\d{4}"
)[[1]]

# Combine the two vectors into a data frame with columns `name` and `phone`.
namephone_df <- data.frame(name, phone)
Identify entries that contain a comma, which indicates a "last_name, first_name" arrangement. These entries must be reordered to "first_name last_name", with the comma removed and any surrounding whitespace trimmed.
# Split "last, first" entries into their two parts. str_match() returns a
# character matrix (full match, then one column per capture group) and fills
# rows that do not match with NA, so every result below stays a plain
# character vector aligned with `name`.
# `[^,]+` keeps multi-word surnames (e.g. "Van Houten") intact.
name_parts <- str_match(name, "^([^,]+),\\s*(.+)$")
reorder_last <- name_parts[, 2]
reorder_first <- name_parts[, 3]

# Rebuild as "first last"; names without a comma are passed through unchanged.
reordered_names <- ifelse(
  is.na(reorder_last),
  name,
  paste(reorder_first, reorder_last)
)
reordered_names_trim <- str_trim(reordered_names)
reordered_names_trim
## [1] "Moe Szyslak" "C. Montgomery Burns" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" "Homer Simpson" "Dr. Julius Hibbert"
# Indices of the entries that contain a comma.
comma_names <- grep(",", name, fixed = TRUE)
# Overwrite only those entries with their reordered, trimmed form.
name <- replace(name, comma_names, reordered_names_trim[comma_names])
name
## [1] "Moe Szyslak" "C. Montgomery Burns" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" "Homer Simpson" "Dr. Julius Hibbert"
# Flag names that carry a title ("Dr." or "Rev.").
# The period is escaped so it matches a literal "." rather than any character
# (otherwise "Drew" or "Reva" would be flagged), and `\\b` requires the title
# to start at a word boundary.
title_detect <- str_detect(name, "\\b(?:Dr|Rev)\\.")
title_detect
## [1] FALSE FALSE TRUE FALSE FALSE TRUE
First, the titles must be removed; otherwise their trailing periods would be mistaken for middle-name initials.
# Strip the titles "Rev. " and "Dr. " (including the trailing space).
# The period is escaped so only a literal "." matches; an unescaped "." would
# also strip ordinary text such as "Dru " from "Dru Hill".
name_no_title <- str_replace_all(
  name,
  pattern = "\\b(?:Rev|Dr)\\. ",
  replacement = ""
)
name_no_title
## [1] "Moe Szyslak" "C. Montgomery Burns" "Timothy Lovejoy"
## [4] "Ned Flanders" "Homer Simpson" "Julius Hibbert"
# With titles removed, a period followed by a space marks an abbreviated
# name part (an initial).
middle_detect <- str_detect(
  string = name_no_title,
  pattern = fixed(". ")
)
middle_detect
## [1] FALSE TRUE FALSE FALSE FALSE FALSE
Describe the types of strings that conform to the following regular expressions and construct an example that is matched by each regular expression. a) [0-9]+\$ b) \b[a-z]{1,4}\b c) .*?\.txt$ d) \d{2}/\d{2}/\d{4} e) <(.+?)>.+?</\1>
This matches one or more digits immediately followed by a literal dollar sign ($). Letters that precede a dollar sign are not matched.
# Example for (a): digits immediately followed by a literal dollar sign.
a_df <- "I Got 99$ Problem$"
str_extract_all(
  string = a_df,
  pattern = "[0-9]+\\$"
)
## [[1]]
## [1] "99$"
This matches whole words made up of one to four lowercase letters, i.e. runs of lowercase letters delimited on both sides by word boundaries such as spaces or punctuation.
# Example for (b): whole words of one to four lowercase letters.
b_df <- "i got 99 problems except you babe"
str_extract_all(
  string = b_df,
  pattern = "\\b[a-z]{1,4}\\b"
)
## [[1]]
## [1] "i" "got" "you" "babe"
# Example for (c): strings that end in ".txt" (typically text-file names).
# Elements that do not end in ".txt" contribute nothing to the result.
c_df <- c("east.txt", "side to the", "west.txt", "side")
unlist(
  str_extract_all(string = c_df, pattern = ".*?\\.txt$")
)
## [1] "east.txt" "west.txt"
This matches two digits, a slash, two digits, a slash, and four digits (XX/XX/XXXX), a common date format such as MM/DD/YYYY.
# Example for (d): the middle date has a two-digit year and is not matched.
d_df <- "07/04/2017 12/24/17 12/25/2017"
unlist(
  str_extract_all(string = d_df, pattern = "\\d{2}/\\d{2}/\\d{4}")
)
## [1] "07/04/2017" "12/25/2017"
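This matches an HTML-style element: an opening tag such as <tag>, at least one character of content, and a closing tag </tag>. The backreference \1 requires the closing tag name to be identical to the opening tag name.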
# Example for (e): only the first element has a matching closing tag.
e_df <- c(
  "<p> I can't get her out of my mind </p>",
  "<I think about the girl all the time>"
)
unlist(
  str_extract_all(string = e_df, pattern = "<(.+?)>.+?</\\1>")
)
## [1] "<p> I can't get her out of my mind </p>"
The following code hides a secret message. Crack it with regular expressions.
# Scrambled text that hides the message.
secret <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"

# First attempt: look at the lowercase letters only.
str_extract_all(string = secret, pattern = "[a-z]")[[1]]
## [1] "c" "l" "c" "o" "p" "o" "w" "z" "m" "s" "t" "c" "d" "w" "n" "k" "i"
## [18] "g" "v" "d" "i" "c" "p" "u" "g" "g" "v" "h" "r" "y" "n" "j" "u" "w"
## [35] "c" "z" "i" "h" "q" "r" "f" "p" "x" "s" "j" "d" "w" "p" "n" "a" "n"
## [52] "w" "o" "w" "i" "s" "d" "i" "j" "j" "k" "p" "f" "d" "r" "c" "o" "c"
## [69] "b" "t" "y" "c" "z" "j" "a" "t" "a" "o" "o" "t" "j" "t" "j" "n" "e"
## [86] "c" "f" "e" "k" "r" "w" "w" "w" "o" "j" "i" "g" "d" "v" "r" "f" "r"
## [103] "b" "z" "b" "k" "n" "b" "h" "z" "g" "v" "i" "z" "c" "r" "o" "p" "w"
## [120] "g" "n" "b" "q" "o" "f" "a" "o" "t" "f" "b" "w" "m" "k" "t" "s" "z"
## [137] "q" "e" "f" "y" "n" "d" "t" "k" "c" "f" "g" "m" "c" "g" "x" "o" "n"
## [154] "h" "k" "g" "r"
No obvious message appears in the lowercase letters.
# Second attempt: look at the uppercase letters only.
str_extract_all(string = secret, pattern = "[A-Z]")[[1]]
## [1] "C" "O" "N" "G" "R" "A" "T" "U" "L" "A" "T" "I" "O" "N" "S" "Y" "O"
## [18] "U" "A" "R" "E" "A" "S" "U" "P" "E" "R" "N" "E" "R" "D"
The message is clear, but the periods that separate the words need to be retained so that the word breaks can be restored.
# Extract the uppercase letters together with the periods that separate the
# words, in a single pass. This avoids substituting an uppercase placeholder
# for the periods, which could collide with genuine letters in the message.
answer <- unlist(str_extract_all(secret, "[A-Z.]"))
answer
## [1] "C" "O" "N" "G" "R" "A" "T" "U" "L" "A" "T" "I" "O" "N" "S" "." "Y"
## [18] "O" "U" "." "A" "R" "E" "." "A" "." "S" "U" "P" "E" "R" "N" "E" "R"
## [35] "D"
# Join the characters into one string, then turn each period into a space.
# `fixed = TRUE` treats "." as a literal character, not a regex wildcard.
answer_together <- paste(answer, collapse = "")
final_answer <- gsub(".", " ", answer_together, fixed = TRUE)
final_answer
## [1] "CONGRATULATIONS YOU ARE A SUPERNERD"