Chap. 8: Regular Expressions and Essential String Functions
8a) Rearrange the vector so that the names are in “firstname lastname” format.
library(stringr)
library(datasets)
# First read the raw string in which names and phone numbers are run together.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"
print(raw.data)
## [1] "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"
# Extract every run of two or more letters, periods, commas, or spaces. The
# digits, hyphens, and parentheses of the phone numbers are not in the
# character class, so they act as separators and are discarded.
name <- unlist(str_extract_all(raw.data, "[[:alpha:]., ]{2,}"))
# Remove the titles. The period is escaped so that it matches a literal "."
# instead of any character.
name2 <- str_replace(name, pattern = "\\b(Rev|Dr)\\. ", replacement = "")
# Remove the initials: a single letter, standing on its own, followed by a
# period. The word boundary keeps endings such as "Jr." intact.
name3 <- str_replace(name2, pattern = "\\b[[:alpha:]]\\.", replacement = "")

# Put one cleaned name into "firstname lastname" order.
#
# x: a single name, either "lastname, firstname" or "firstname lastname".
# Returns a single string "firstname lastname".
to_first_last <- function(x) {
  if (str_detect(x, ",")) {
    # "lastname, firstname": swap the two comma-separated parts.
    parts <- str_trim(unlist(str_split(x, ",")))
    first <- parts[2]
    last <- parts[1]
  } else {
    # Already in order. Split on runs of whitespace so that the gap left by
    # a removed initial does not produce an empty piece.
    parts <- unlist(str_split(str_trim(x), "\\s+"))
    first <- parts[1]
    last <- parts[2]
  }
  paste(first, last, sep = " ")
}

# Apply the reordering to every extracted name, however many there are.
final_names <- vapply(name3, to_first_last, character(1), USE.NAMES = FALSE)
## [1] "Moe Szyslak" "Montgomery Burns" "Timothy Lovejoy"
## [4] "Ned Flanders" "Homer Simpson" "Julius Hibbert"
Find which names have titles.
# Flag the names that carry a title. The word boundaries keep the
# abbreviations from matching inside ordinary words (e.g. "Dr" in "Drew" or
# "Prof" in "Professor").
titles <- str_detect(name, "\\b(Rev|Dr|Mr|Mrs|Ms|Prof)\\b")
print(titles)
## [1] FALSE FALSE TRUE FALSE FALSE TRUE
Find which names have second names. (This assumes that “second name” means the same as “last name”.)
# A name is counted as having a second name when it contains a space.
# NOTE(review): final_names is assembled with a space between its two parts,
# so this is presumably TRUE for every entry -- confirm that is intended.
second_names <- str_detect(string = final_names, pattern = " ")
print(second_names)
## [1] TRUE TRUE TRUE TRUE TRUE TRUE
4(a) Describe the type of string specified by the regular expression [0-9]+\$
This describes a string of one or more digits (0 through 9) immediately followed by a literal dollar sign; the backslash escapes the “$”, so it is not an end-of-string anchor. An example is the string “9120$”.
# Confirm that the pattern extracts the whole example string.
s <- "9120$"
t <- str_extract(string = s, pattern = "[0-9]+\\$")
print(s == t)
## [1] TRUE
4(b) Describe the type of string specified by the regular expression \b[a-z]{1,4}\b
This describes a whole word made up of 1 to 4 lowercase letters: the \b on each side is a word boundary, so the letters cannot be part of a longer word. An example is the string “abc”.
# Confirm that the pattern extracts the whole example word.
s <- "abc"
t <- str_extract(string = s, pattern = "\\b[a-z]{1,4}\\b")
print(s == t)
## [1] TRUE
4(c) Describe the type of string specified by the regular expression .*?\.txt$
This describes a string consisting of any characters (“.”), repeated 0 or more times (“*”) and matched lazily, i.e. as few times as possible (“?”), followed by the literal string “.txt” at the very end of the string (“$”). An example is the string “animals.txt”.
# Confirm that the pattern extracts the whole example file name.
s <- "animals.txt"
t <- str_extract(string = s, pattern = ".*?\\.txt$")
print(s == t)
## [1] TRUE
4(d) Describe the type of string specified by the regular expression \d{2}/\d{2}/\d{4}
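This describes a string that contains 2 digits, followed by the symbol “/”, followed by 2 digits, followed by “/”, followed by 4 digits, for example a date such as “12/25/2016”. Because the pattern is not anchored, it also matches inside a longer string such as “789/12/4567”.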
# The pattern is unanchored, so it is detected inside the example string even
# though the string starts with three digits.
s <- "789/12/4567"
matched <- str_detect(string = s, pattern = "\\d{2}/\\d{2}/\\d{4}")
print(matched)
## [1] TRUE
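4(e) Describe the type of string specified by the regular expression <(.+?)>.+?</\1>
Any sequence of characters that starts with an opening tag “<tag>”, is followed by one or more characters, and ends with the matching closing tag “</tag>”; the backreference \1 requires the closing tag name to be identical to the opening one. An example is “<para>This is an example</para>”.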
# The backreference \\1 requires the closing tag to repeat the opening tag.
tag_pattern <- "<(.+?)>.+?</\\1>"
# Opening and closing tags agree, so the whole element is extracted.
s <- "<para>This is an example</para>"
t <- str_extract(string = s, pattern = tag_pattern)
print(t)
## [1] "<para>This is an example</para>"
# The closing tag differs from the opening tag, so nothing is extracted.
s <- "<para>This is not an example!</parX>"
t <- str_extract(string = s, pattern = tag_pattern)
print(t)
## [1] NA
# The scrambled text, assembled from its four lines joined by newlines.
s1 <- paste(
  "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpnøTanwo",
  "Uwisdij7Lj8kpf03AT5Idr3cocøbt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigO",
  "d6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5",
  "fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr",
  sep = "\n"
)
print(s1)
## [1] "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpnøTanwo\nUwisdij7Lj8kpf03AT5Idr3cocøbt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigO\nd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5\nfy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"
# Pull out the uppercase letters in order; read together they spell the
# hidden message.
s2 <- str_extract_all(string = s1, pattern = "[A-Z]")
print(s2)
## [[1]]
## [1] "C" "O" "N" "G" "R" "A" "T" "U" "L" "A" "T" "I" "O" "N" "S" "Y" "O"
## [18] "U" "A" "R" "E" "A" "S" "U" "P" "E" "R" "N" "E" "R" "D"