library(stringr)
# Raw phone-book text in which names and phone numbers are run together.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"
# A name is any run of at least two letters, periods, commas or spaces.
# str_extract_all() returns a one-element list for the single input string,
# so taking that element yields the plain character vector of names.
name <- str_extract_all(
  string = raw.data,
  pattern = "[[:alpha:]., ]{2,}"
)[[1]]
# Reduce one person's name to "First Last".
#
# Handles both "First [Middle] Last" and "Last, First [Middle]" input, with an
# optional leading title ("Rev." or "Dr."). For the comma form the pieces are
# simply reversed, so the word reported as the first name is the one written
# last in the input ("Burns, C. Montgomery" -> "Montgomery Burns").
#
# Args:
#   string: a length-one character value (or something coercible to one).
#
# Returns:
#   A length-one character vector: the first and last remaining words joined
#   by a single space; the lone word when only one remains; "" when nothing
#   remains; NA_character_ when the input is NA.
custxtfun <- function(string) {
  stopifnot(length(string) == 1L)
  string <- as.character(string)
  if (is.na(string)) {
    return(NA_character_)
  }

  if (grepl(",", string, fixed = TRUE)) {
    # "Last, First ..." -> reverse the pieces so the surname ends up last.
    parts <- rev(strsplit(string, split = ", | ")[[1]])
  } else {
    parts <- strsplit(string, split = " ", fixed = TRUE)[[1]]
  }
  # Stray leading/trailing/double spaces produce empty pieces; ignore them.
  parts <- parts[nzchar(parts)]

  # Drop a leading title. The pattern is anchored and the period escaped so
  # that only the exact tokens "Rev." and "Dr." count; an unescaped "." would
  # also treat first names such as "Drew" as a title.
  if (length(parts) > 0L && grepl("^(Rev|Dr)\\.$", parts[1])) {
    parts <- parts[-1]
  }

  if (length(parts) == 0L) {
    return("")
  }
  if (length(parts) == 1L) {
    # Avoid duplicating a lone word ("Hibbert Hibbert").
    return(parts)
  }
  paste(parts[1], parts[length(parts)])
}
# Normalise every extracted name to "First Last". vapply() is used instead of
# sapply() so the result is always a character vector, even for empty input.
namelist <- vapply(name, custxtfun, character(1), USE.NAMES = FALSE)
print(namelist)
## [1] "Moe Szyslak" "Montgomery Burns" "Timothy Lovejoy"
## [4] "Ned Flanders" "Homer Simpson" "Julius Hibbert"
# Flag names that carry a title. The period is escaped so it matches a literal
# "." (an unescaped "." matches any character, so e.g. "Drew" would count),
# and \\b keeps the match at the start of a word.
has_title <- str_detect(name, "\\b(Rev|Dr)\\.")
data.frame(name, has_title)
# Flag names that contain an initial: a stand-alone capital letter followed by
# a period and a space (e.g. "C. ").
has_sname <- str_detect(name, "\\b[A-Z]\\. ")
data.frame(name, has_sname)
# Matches one or more digits immediately followed by a literal dollar sign,
# e.g. to pull out dollar amounts written with a trailing "$".
str_extract(
  string = c("10$", "1234567$", "1 asdf 123$", "$", "$123"), # sample strings
  pattern = "[0-9]+\\$"
)
## [1] "10$" "1234567$" "123$" NA NA
# Matches the first stand-alone word made of 1 to 4 lower-case letters. The
# \b word boundaries on both sides reject longer words ("abcdef") and words
# that mix in digits ("ab3d").
str_extract(
  string = c("a", "abcd", "as d", "abcdef", "ab3d"), # sample strings
  pattern = "\\b[a-z]{1,4}\\b"
)
## [1] "a" "abcd" "as" NA NA
# Matches text that ends in ".txt": the "$" anchors the suffix to the end of
# the string, and the escaped "\\." requires a literal period.
str_extract(
  string = c("file1.exe", "file", "letter3.txt"), # sample strings
  pattern = ".*?\\.txt$"
)
## [1] NA NA "letter3.txt"
# Matches two digits, a slash, two digits, a slash and four digits, so it can
# extract dates written as DD/MM/YYYY or MM/DD/YYYY. Fields with fewer digits
# (as in "1/1/1") do not match.
str_extract(
  string = c("1/1/1", "01/30/2000", "30/01/2000"), # sample strings
  pattern = "\\d{2}/\\d{2}/\\d{4}"
)
## [1] NA "01/30/2000" "30/01/2000"
# Matches complete XML elements: an opening tag, its content, and a closing
# tag whose name must equal the opening tag's name (backreference \1).
str_extract_all(
  string = "Content outside the tag <xmltag1>content</xmltag1> and other<xmltag2>content</xmltag2> elements 123",
  pattern = "<(.+?)>.+?</\\1>"
)
## [[1]]
## [1] "<xmltag1>content</xmltag1>" "<xmltag2>content</xmltag2>"
# 9. The following code hides a secret message. Crack it with R and regular expressions. Hint: Some of the characters are more revealing than others! The code snippet is also available in the materials at www.r-datacollection.com.
# Scrambled input text for the secret-message exercise; the string is data and
# must be kept exactly as given.
secret <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"