DATA 607 - Homework #3 - Zach Alexander

Libraries used:

library(stringr)
library(knitr)

PROBLEM #3: Copy the introductory example. The vector ‘name’ stores the extracted names.

Reading in variables from the text:

# Raw text from the introductory example: names and phone numbers are run
# together in one long string.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"

# Names: runs of at least two letters, periods, commas, or spaces.
# str_extract_all() returns a list with one element per input string;
# raw.data is a single string, so the first element holds every match.
name <- str_extract_all(raw.data, "[[:alpha:]., ]{2,}")[[1]]

# Phone numbers: an optional (possibly parenthesised) area code, then three
# and four digits, each optionally separated by a dash or a space.
phone <- str_extract_all(
  raw.data,
  "\\(?(\\d{3})?\\)?(-| )?\\d{3}(-| )?\\d{4}"
)[[1]]

Problem #3.1. Use the tools of this chapter to rearrange the vector so that all elements conform to the standard first_name last_name.

## Reduce every name to plain "first_name last_name" order; middle initials
## and titles are dropped along the way.

# Step 1: drop a single-letter initial preceded by a space
# ("Burns, C. Montgomery" becomes "Burns, Montgomery")
name_2 <- str_replace(
  string = name,
  pattern = " \\w{1}\\.",
  replacement = ""
)

# Step 2: where a comma separates the parts, swap "last, first" into
# "first last"
name_3 <- str_replace(
  string = name_2,
  pattern = "(\\w+),\\s(\\w+)",
  replacement = "\\2 \\1"
)

# Step 3: strip the first "word, period, space" token, i.e. the title
# ("Rev. " or "Dr. ")
just_first_last_name <- str_replace(
  string = name_3,
  pattern = "\\w{1,}\\. ",
  replacement = ""
)

# Resulting vector containing only first_name and last_name
just_first_last_name

## [1] "Moe Szyslak"      "Montgomery Burns" "Timothy Lovejoy" 
## [4] "Ned Flanders"     "Homer Simpson"    "Julius Hibbert"

## The following steps put the names into first_name last_name order while
## keeping second names (initials) and titles, each in its proper position.
##
## Every period in the patterns below is escaped. An unescaped "." matches any
## character, so a pattern such as "\\s\\w.\\s" would also fire on an ordinary
## two-letter word (e.g. the "Jo" in "Mary Jo Smith") and scramble the name.

# "Last, X. First" -> "X. First Last"
# ("Burns, C. Montgomery" to "C. Montgomery Burns")
name_5 <- str_replace(name, "(\\w+),\\s(\\w\\.\\s\\w+)", "\\2 \\1")

# Move a leading initial behind the first name so it sits in the middle
# ("C. Montgomery Burns" to "Montgomery C. Burns"). The "^" anchor restricts
# the swap to an initial at the very start of the name.
name_6 <- str_replace(name_5, "^(\\w\\.)\\s(\\w+)", "\\2 \\1")

# "Last, First" -> "First Last" ("Simpson, Homer" to "Homer Simpson")
name_7 <- str_replace(name_6, "(\\w+),\\s(\\w+)", "\\2 \\1")

# Collapse any run of repeated whitespace into a single space
first_last_name_extra <- str_replace_all(name_7, "\\s{2,}", " ")

first_last_name_extra

## [1] "Moe Szyslak"          "Montgomery C. Burns"  "Rev. Timothy Lovejoy"
## [4] "Ned Flanders"         "Homer Simpson"        "Dr. Julius Hibbert"

Problem #3.2. Construct a logical vector indicating whether a character has a title (i.e., Rev. and Dr.).

# A title is taken to be an abbreviation of two or more letters that ends in a
# literal period ("Rev.", "Dr."). Requiring at least two letters keeps
# single-letter initials such as "C." from being counted. The period is
# escaped: with an unescaped "." the pattern matched any three-word name
# (e.g. "John Paul Jones"), whether or not it carried a title.
has_title <- str_detect(name, "\\b[[:alpha:]]{2,}\\.")
has_title

## [1] FALSE FALSE  TRUE FALSE FALSE  TRUE

Problem #3.3. Construct a logical vector indicating whether a character has a second name.

# A second name appears in this data as a single-letter initial followed by a
# literal period ("C."). The word boundary ensures the letter stands alone, so
# titles such as "Dr." are not counted. Escaping the period avoids false
# positives on two-letter words (e.g. "Smith, Jo Ann"), and the pattern no
# longer depends on the "Last, X. First" layout, so "C. Montgomery Burns" is
# detected as well. A second name that is spelled out in full cannot be
# recognised this way.
has_second_name <- str_detect(name, "\\b[[:alpha:]]\\.")
has_second_name

## [1] FALSE  TRUE FALSE FALSE FALSE FALSE

You can find all three solutions in the following data frame:

# Put the extracted names next to the results of problems 3.1 - 3.3, one row
# per character, and render the data frame as a table.
final_table <- data.frame(
  name = name,
  just_first_last_name = just_first_last_name,
  first_last_name_extra = first_last_name_extra,
  has_title = has_title,
  has_second_name = has_second_name
)
kable(final_table)

name	just_first_last_name	first_last_name_extra	has_title	has_second_name
Moe Szyslak	Moe Szyslak	Moe Szyslak	FALSE	FALSE
Burns, C. Montgomery	Montgomery Burns	Montgomery C. Burns	FALSE	TRUE
Rev. Timothy Lovejoy	Timothy Lovejoy	Rev. Timothy Lovejoy	TRUE	FALSE
Ned Flanders	Ned Flanders	Ned Flanders	FALSE	FALSE
Simpson, Homer	Homer Simpson	Homer Simpson	FALSE	FALSE
Dr. Julius Hibbert	Julius Hibbert	Dr. Julius Hibbert	TRUE	FALSE

PROBLEM #4 Describe the types of strings that conform to the following regular expressions and construct an example that is matched by the regular expression.

Problem #4.1

# Pattern: one or more digits immediately followed by a literal "$".
# The match may sit anywhere inside the string.
expression <- "[0-9]+\\$"

examples <- c("873212233$", "3$", "abd3ek1$")

# Each example contains a match, so every element should be TRUE
str_detect(string = examples, pattern = expression)

## [1] TRUE TRUE TRUE

Problem #4.2

# Pattern: a whole word made of one to four lowercase letters. The word
# boundaries (\b) on both sides mean the letters must form a complete word;
# the rest of the string may contain anything else.
expression <- "\\b[a-z]{1,4}\\b"

examples <- c("dgfg", "road", "help")

# Each example is itself such a word, so every element should be TRUE
str_detect(string = examples, pattern = expression)

## [1] TRUE TRUE TRUE

Problem #4.3

# Pattern: any run of characters (possibly empty, matched lazily) followed by
# a literal ".txt" at the very end of the string; "$" anchors the end.
expression <- ".*?\\.txt$"

examples <- c(".txt", "dafd$#@;.txt", "adas45322.txt")

# Each example ends in ".txt", so every element should be TRUE
str_detect(string = examples, pattern = expression)

## [1] TRUE TRUE TRUE

Problem #4.4

# Pattern: two digits, a slash, two digits, a slash, then four digits,
# i.e. the date-like layout xx/xx/xxxx.
expression <- "\\d{2}/\\d{2}/\\d{4}"

examples <- c("12/23/2018", "04/23/1980", "05/12/1990")

# Each example follows the layout, so every element should be TRUE
str_detect(string = examples, pattern = expression)

## [1] TRUE TRUE TRUE

Problem #4.5

# Pattern: an opening tag <...> whose content (one or more characters) is
# captured, then one or more characters, then a closing tag </...> whose
# content repeats the captured text via the backreference \1. This is the
# shape of paired HTML markup such as <div>Something</div>.
expression <- "<(.+?)>.+?</\\1>"

examples <- c(
  "<div>Something</div>",
  "<section>Test paragraph</section>",
  "<html><body></body></html>"
)

# Each example has a matching open/close pair, so every element should be TRUE
str_detect(string = examples, pattern = expression)

## [1] TRUE TRUE TRUE

EXTRA CREDIT: The following code hides a secret message. Crack it with R and regular expressions. Hint: Some of the characters are more revealing than others! The code snippet is also available in the materials at www.r-datacollection.com.

(I had some help separating the final ‘concatString’ (see below) by referencing this source. This helped me identify the periods in the secret code, which I eventually replaced with spaces.)

secret_message <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"

# A few exploratory extractions, used to see which character class carries
# the message.

# extract all runs of letters, regardless of case
all_letters <- unlist(str_extract_all(secret_message, "[[:alpha:]]{1,}"))

# extract all runs of digits
all_digits <- unlist(str_extract_all(secret_message, "[[:digit:]]{1,}"))

# extract all runs of lowercase letters
all_lowercase_letters <- unlist(str_extract_all(secret_message, "[[:lower:]]{1,}"))

# extract all runs of uppercase letters
all_uppercase_letters <- unlist(str_extract_all(secret_message, "[[:upper:]]{1,}"))

# the uppercase letters spell the message; the periods mark where the spaces
# between words belong
all_uppercase_letters_and_periods <- unlist(str_extract_all(secret_message, "[[:upper:].]"))

# The message also ends with an exclamation mark, which the class above leaves
# out. Extract it together with the uppercase letters and periods so the
# decoded text is complete.
message_characters <- unlist(str_extract_all(secret_message, "[[:upper:].!]"))

# glue the single characters back together into one string
concatString <- paste(message_characters, collapse = "")

# replace every period with a space to create the final solution
solution <- str_replace_all(concatString, fixed("."), " ")
solution

## [1] "CONGRATULATIONS YOU ARE A SUPERNERD!"