data607_week3_assignment

Chap. 8: Regular Expressions and Essential String Functions

8a) Rearrange the vector so that the names are in “firstname lastname” format.

library(stringr)
library(datasets)

# Raw phone-book text: each entry is a phone number immediately followed by a
# name, and the entries are run together with no separator between them.
raw.data <- paste0(
  "555-1239Moe Szyslak",
  "(636) 555-0113Burns, C. Montgomery",
  "555-6542Rev. Timothy Lovejoy",
  "555 8904Ned Flanders",
  "636-555-3226Simpson, Homer",
  "5553642Dr. Julius Hibbert"
)
print(raw.data)

## [1] "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"

# Extract the names: each match is a run of two or more letters, periods,
# commas or spaces. Digits, hyphens and parentheses are not in the character
# class, so the phone numbers act as delimiters; the {2,} lower bound discards
# the lone spaces that occur inside phone numbers such as "555 8904".
name <- unlist(str_extract_all(raw.data, "[[:alpha:]., ]{2,}"))

# Remove the title prefixes. The period is escaped so that only a literal
# "Rev. " or "Dr. " is removed (an unescaped "." matches any character).
name2 <- str_replace(name, pattern = "Rev\\. |Dr\\. ", replacement = "")

# Remove a single-letter initial such as "C.". The leading word boundary
# keeps the pattern from eating the tail of a longer token (e.g. "Jr.").
name3 <- str_replace(name2, pattern = "\\b[[:alpha:]]\\.", replacement = "")

# Entries written as "last, first" are swapped around the first comma;
# entries without a comma are already in "first last" order and do not match
# the pattern, so they pass through unchanged. This is vectorised over all
# names, so it works for any number of entries and keeps every word of a name.
swapped <- str_replace(
  name3,
  pattern = "^([^,]*),(.*)$",
  replacement = "\\2 \\1"
)

# Collapse the whitespace left behind by the removals and trim the ends.
final_names <- str_trim(
  str_replace_all(swapped, pattern = "\\s+", replacement = " ")
)
print(final_names)

## [1] "Moe Szyslak"      "Montgomery Burns" "Timothy Lovejoy" 
## [4] "Ned Flanders"     "Homer Simpson"    "Julius Hibbert"

Find which names have titles.

# Flag the entries of `name` that carry a title. A title is one of the listed
# abbreviations starting at a word boundary and followed by a literal period,
# so ordinary names that merely contain these letters (e.g. "Drew", "Revere")
# are not reported as titles.
titles <- str_detect(name, "\\b(Rev|Dr|Mr|Mrs|Ms|Prof)\\.")
print(titles)

## [1] FALSE FALSE  TRUE FALSE FALSE  TRUE

Find which names have second names. (Assuming that “second name” is same as last name.)

# A name is treated as having a second name when it contains a space,
# i.e. when it consists of more than one word.
second_names <- str_detect(string = final_names, pattern = " ")
print(second_names)

## [1] TRUE TRUE TRUE TRUE TRUE TRUE

4(a) Describe the type of string specified by the regular expression [0-9]+\$

This matches one or more digits (0 through 9) followed by a literal dollar sign; the backslash escapes the “$”, so it is an ordinary character here rather than the end-of-string anchor. The pattern is not anchored, so the match may occur anywhere inside a longer string. An example is the string “9120$”.

# Example for 4(a): digits followed by a literal dollar sign.
s <- "9120$"
t <- str_extract(string = s, pattern = "[0-9]+\\$")
# TRUE when the extracted match is the whole example string.
print(s == t)

## [1] TRUE

4(b) Describe the type of string specified by the regular expression \b[a-z]{1,4}\b

This matches a standalone word made of 1 to 4 lowercase letters: the “\b” on each side is a word boundary, so the letters cannot be part of a longer word. An example is the string “abc”.

# Example for 4(b): a word of 1 to 4 lowercase letters between word boundaries.
s <- "abc"
t <- str_extract(string = s, pattern = "\\b[a-z]{1,4}\\b")
# TRUE when the extracted match is the whole example string.
print(s == t)

## [1] TRUE

4(c) Describe the type of string specified by the regular expression .*?\.txt$

This describes a string consisting of any character (“.”), repeated 0 or more times (“*”) and matched lazily (the “?” after “*” makes the quantifier non-greedy; it does not make the part optional), followed by the literal string “.txt” at the very end of the string (“$”). An example is the string “animals.txt”.

# Example for 4(c): any text, matched lazily, ending in a literal ".txt".
s <- "animals.txt"
t <- str_extract(string = s, pattern = ".*?\\.txt$")
# TRUE when the extracted match is the whole example string.
print(s == t)

## [1] TRUE

4(d) Describe the type of string specified by the regular expression \d{2}/\d{2}/\d{4}

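This is a regular expression that matches 2 digits, followed by the symbol “/”, followed by 2 digits, followed by “/”, followed by 4 digits, such as the date “09/12/4567”. Because the pattern is not anchored, it is also detected inside a longer string: in “789/12/4567” it matches the substring “89/12/4567”.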

# Example for 4(d): two digits, "/", two digits, "/", four digits.
s <- "789/12/4567"
date_pattern <- "\\d{2}/\\d{2}/\\d{4}"
# The pattern is unanchored, so str_detect() reports a match as soon as any
# substring fits (here "89/12/4567").
print(str_detect(string = s, pattern = date_pattern))

## [1] TRUE

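4(e) Describe the type of string specified by the regular expression <(.+?)>.+?</\1>

Any sequence of characters that starts with an opening tag “<tag>”, contains at least one character, and ends with the matching closing tag “</tag>”; the backreference \1 requires the closing tag name to be identical to the opening tag name. An example is “<para>This is an example</para>”.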

# Example for 4(e): the opening and closing tag names agree, so the whole
# string is returned as the match.
s <- "<para>This is an example</para>"
t <- str_extract(string = s, pattern = "<(.+?)>.+?</\\1>")
print(t)

## [1] "<para>This is an example</para>"

# Counter-example for 4(e): the closing tag name differs from the opening
# one, so the backreference \1 cannot match and the result is NA.
s <- "<para>This is not an example!</parX>"
t <- str_extract(string = s, pattern = "<(.+?)>.+?</\\1>")
print(t)

## [1] NA

Find the hidden message in the string using regular expressions. Note: I could not find this string on the textbook’s website (http://www.r-datacollection.com/bookmaterials.html), so I copied the image from the homework page into a Microsoft OneNote document and then used the OCR feature of OneNote to convert it into text.

# Scrambled text holding the hidden message. The four pieces are joined with
# newline characters, so the resulting string contains three line breaks.
s1 <- paste(
  "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpnøTanwo",
  "Uwisdij7Lj8kpf03AT5Idr3cocøbt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigO",
  "d6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5",
  "fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr",
  sep = "\n"
)
print(s1)

## [1] "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpnøTanwo\nUwisdij7Lj8kpf03AT5Idr3cocøbt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigO\nd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5\nfy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"

# The hidden message is spelled by the uppercase letters: pull out every
# character in A-Z, in order of appearance, and print the resulting list.
s2 <- str_extract_all(string = s1, pattern = "[A-Z]")
print(s2)

## [[1]]
##  [1] "C" "O" "N" "G" "R" "A" "T" "U" "L" "A" "T" "I" "O" "N" "S" "Y" "O"
## [18] "U" "A" "R" "E" "A" "S" "U" "P" "E" "R" "N" "E" "R" "D"

data607_week3_assignment

Vikas Sinha

September 17, 2017