library(stringr)
3.
# Phone-book text in which names and phone numbers run together.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5543642Dr. Julius Hibbert"
# Keep every run of two or more letters, periods, commas or spaces. The input
# is a single string, so the first (and only) list element holds all matches.
names <- str_extract_all(raw.data, "[[:alpha:]., ]{2,}")[[1]]
?? Question to Andy - is there a vectorized approach to solving this?
# Put every name into "first last" order.
# Names written as "Last, First" are flipped around the first ", "; names
# without that separator pass through unchanged. Anything after the first
# ", " is kept as part of the given name rather than being dropped.
# str_replace() is vectorized over `names`, so no loop (and no repeated
# copying from growing a vector with c()) is needed, and an empty `names`
# yields character(0) instead of NULL.
first_last <- str_replace(names, "^(.*?), (.*)$", "\\2 \\1")
first_last
## [1] "Moe Szyslak" "C. Montgomery Burns" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" "Homer Simpson" "Dr. Julius Hibbert"
# Flag names that carry a title such as "Rev." or "Dr.": two or more letters
# immediately followed by a period. A one-letter initial like "C." does not
# qualify. str_detect() is vectorized, so it is applied to the whole `names`
# vector at once instead of appending one result per loop iteration.
# The character class is written in its bracketed form, [[:alpha:]], to match
# the extraction pattern used for `names` and to stay valid in regex engines
# that do not accept a bare [:alpha:].
has_title <- str_detect(names, "[[:alpha:]]{2,}[.]")
data.frame(first_last, has_title)
## first_last has_title
## 1 Moe Szyslak FALSE
## 2 C. Montgomery Burns FALSE
## 3 Rev. Timothy Lovejoy TRUE
## 4 Ned Flanders FALSE
## 5 Homer Simpson FALSE
## 6 Dr. Julius Hibbert TRUE
# A second name is recognised by its initial: one capital letter directly
# followed by a period (e.g. the "C." in "Burns, C. Montgomery").
has_second_name <- str_detect(string = names, pattern = "[A-Z]\\.")
data.frame(first_last, has_second_name)
## first_last has_second_name
## 1 Moe Szyslak FALSE
## 2 C. Montgomery Burns TRUE
## 3 Rev. Timothy Lovejoy FALSE
## 4 Ned Flanders FALSE
## 5 Homer Simpson FALSE
## 6 Dr. Julius Hibbert FALSE
4.
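[0-9]+\\$ – Matches one or more digits followed by a literal dollar sign (e.g. "123$"); the double backslash escapes the `$`, so it no longer acts as the end-of-string anchor. That is why the pattern finds nothing in the test string below, whereas the unescaped `[0-9]+$` matches the digits at the very end of the string.
test = 'asfjsk23423 ask2 asdkf 123sdf sdf123'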
# Escaped "$": digits must be followed by a literal dollar sign, so no match.
str_extract_all(string = test, pattern = "[0-9]+\\$")
## [[1]]
## character(0)
# Unescaped "$": end-of-string anchor, so the trailing digits are returned.
str_extract_all(string = test, pattern = "[0-9]+$")
## [[1]]
## [1] "123"
\\b[a-z]{1,4}\\b – Find whole words made up of one to four lowercase letters; the `\\b` word boundaries exclude longer words and letters that are attached to digits.
str_extract_all('abc abcd abcdef 123abc abc123', '\\b[a-z]{1,4}\\b')
## [[1]]
## [1] "abc" "abcd"
.*?\\.txt$ – Find strings that end in ".txt"; the match runs from the start of the string up to and including the extension.
str_extract_all('abc123.txt', '.*?\\.txt$')
## [[1]]
## [1] "abc123.txt"
\\d{2}/\\d{2}/\\d{4} – Find date-like groups of two digits, two digits and four digits, separated by forward slashes (e.g. dd/mm/yyyy or mm/dd/yyyy).
str_extract_all('12/34/1234 1/1/1234 12/34/12 43/32/4321', '\\d{2}/\\d{2}/\\d{4}')
## [[1]]
## [1] "12/34/1234" "43/32/4321"
<(.+?)>.+?</\\1> – Find an opening tag, its content and the matching closing tag; the backreference `\\1` requires the closing tag name to be identical to the opening one.
str_extract_all('<head>Some heaader text</head>', '<(.+?)>.+?</\\1>')
## [[1]]
## [1] "<head>Some heaader text</head>"
# Closing tag name differs from the opening one, so the backreference fails
# and nothing is extracted.
str_extract_all(
  string = "<head>Some heaader text</typo>",
  pattern = "<(.+?)>.+?</\\1>"
)
## [[1]]
## character(0)
9.
# Scrambled text whose upper-case letters spell out a hidden message.
code <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"
# Pull out each upper-case letter, in order of appearance.
str_extract_all(string = code, pattern = "[[:upper:]]")
## [[1]]
## [1] "C" "O" "N" "G" "R" "A" "T" "U" "L" "A" "T" "I" "O" "N" "S" "Y" "O"
## [18] "U" "A" "R" "E" "A" "S" "U" "P" "E" "R" "N" "E" "R" "D"