Grando 3 Homework

# Use a 100-column output width for everything printed below.
options(width = 100)
# Point the working directory at the assignment folder. The path
# differs between my Windows machine and my Linux machine, so it is
# chosen from the reported system name; anything that is not Windows
# falls through to the Linux-style location.
setwd(switch(Sys.info()[["sysname"]],
    Windows = "~/Masters/DATA607/Week3/Assignment",
    "~/Documents/Masters/DATA607/Week3/Assignment"))

Load the Raw Text String and Process It as Performed In The Reading

library(stringr)
# Raw text in which names and phone numbers run together.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"
# Pull out every run of two or more letters, periods, commas or
# spaces. raw.data is a single string, so the first (and only) list
# element returned by str_extract_all() holds all of the matches.
name <- str_extract_all(raw.data, "[[:alpha:]., ]{2,}")[[1]]
# Keep the untouched names in a one-column data frame.
name_df <- data.frame(original = name)
name_df
##               original
## 1          Moe Szyslak
## 2 Burns, C. Montgomery
## 3 Rev. Timothy Lovejoy
## 4         Ned Flanders
## 5       Simpson, Homer
## 6   Dr. Julius Hibbert

Problem 3.1 - Use the tools of this chapter to rearrange the vector so that all elements conform to the standard first_name last_name

# Report whether each string contains a comma.
#
# x: character vector to inspect.
# Returns a logical vector with one element per element of x.
has_comma <- function(x) {
    # Return the result directly. Assigning it to a local named
    # has_comma shadowed the function's own name and made the value
    # come back invisibly.
    str_detect(string = x, pattern = ",")
}
# Split one name into its parts and rebuild it as "first_name last_name".
#
# x: a single name, written either as "[title] first_name last_name"
#    or as "last_name, first_name [second_name]".
# Returns a character vector of length five, in this order:
# last_name, first_name, second_name, title, first_and_last_name.
# Parts that the pattern does not find are NA.
convert_names <- function(x) {
    # This match would not work if a title was after the comma
    # (e.g. last_name, title first_name) but the data set in
    # question does not have this issue
    matched_string <- str_match(x, "([[:alpha:]]+\\.\\s)?([[:alpha:]]+),?\\s([[:alpha:]]+\\.?\\s?)([[:alpha:]]+\\.?\\s?)?")
    # Columns of the match: 1 = whole match, 2 = title, 3 and 4 = the
    # two leading name words, 5 = optional third word. The title and
    # name groups also capture their trailing space (e.g. "C. "), which
    # produced "C.  Burns" with a doubled space, so trim every part.
    parts <- str_trim(matched_string[1, ])
    title <- parts[2]
    second_name <- parts[5]
    # A comma means the last name was written first.
    if (isTRUE(has_comma(x))) {
        last_name <- parts[3]
        first_name <- parts[4]
    } else {
        last_name <- parts[4]
        first_name <- parts[3]
    }
    first_and_last_name <- paste(first_name, last_name, sep = " ")
    c(last_name, first_name, second_name, title, first_and_last_name)
}
# Convert every original name. vapply() requires each result to be a
# character vector of length five, so name_conv is always a character
# matrix with one column per name; sapply() gives no such guarantee
# and the row indexing below relies on it.
name_conv <- vapply(name_df$original, convert_names, character(5))
# Append each row of the matrix as its own column, then label the new
# columns in the order in which convert_names() returns its parts.
name_df <- cbind(name_df, name_conv[1, ], name_conv[2, ], name_conv[3, ],
    name_conv[4, ], name_conv[5, ])
colnames(name_df)[2:6] <- c("last_name", "first_name", "second_name",
    "title", "first_and_last_name")
name_df
##               original last_name first_name second_name title first_and_last_name
## 1          Moe Szyslak   Szyslak        Moe        <NA>  <NA>         Moe Szyslak
## 2 Burns, C. Montgomery     Burns        C.   Montgomery  <NA>           C.  Burns
## 3 Rev. Timothy Lovejoy   Lovejoy    Timothy        <NA> Rev.      Timothy Lovejoy
## 4         Ned Flanders  Flanders        Ned        <NA>  <NA>        Ned Flanders
## 5       Simpson, Homer   Simpson      Homer        <NA>  <NA>       Homer Simpson
## 6   Dr. Julius Hibbert   Hibbert     Julius        <NA>  Dr.       Julius Hibbert
# Show the rearranged names as a plain character vector.
as.character(name_df[["first_and_last_name"]])
## [1] "Moe Szyslak"     "C.  Burns"       "Timothy Lovejoy" "Ned Flanders"    "Homer Simpson"  
## [6] "Julius Hibbert"

Problem 3.2 - Construct a logical vector indicating whether a character has a title (i.e., Rev. and Dr.).

# Flag names that begin with one or more letters immediately followed
# by a period.
name_df[["title_vector"]] <- str_detect(name_df[["original"]], "^[[:alpha:]]+[.]")
name_df
##               original last_name first_name second_name title first_and_last_name title_vector
## 1          Moe Szyslak   Szyslak        Moe        <NA>  <NA>         Moe Szyslak        FALSE
## 2 Burns, C. Montgomery     Burns        C.   Montgomery  <NA>           C.  Burns        FALSE
## 3 Rev. Timothy Lovejoy   Lovejoy    Timothy        <NA> Rev.      Timothy Lovejoy         TRUE
## 4         Ned Flanders  Flanders        Ned        <NA>  <NA>        Ned Flanders        FALSE
## 5       Simpson, Homer   Simpson      Homer        <NA>  <NA>       Homer Simpson        FALSE
## 6   Dr. Julius Hibbert   Hibbert     Julius        <NA>  Dr.       Julius Hibbert         TRUE
# Show the title flags on their own.
name_df[["title_vector"]]
## [1] FALSE FALSE  TRUE FALSE FALSE  TRUE

Problem 3.3 - Construct a logical vector indicating whether a character has a second name.

# Flag names that contain a non-letter, then a single letter and a
# period, then whitespace and at least one more letter.
name_df[["second_name_test"]] <- str_detect(string = name_df[["original"]],
    pattern = "[^[:alpha:]][[:alpha:]]\\.\\s[[:alpha:]]+")
name_df
##               original last_name first_name second_name title first_and_last_name title_vector
## 1          Moe Szyslak   Szyslak        Moe        <NA>  <NA>         Moe Szyslak        FALSE
## 2 Burns, C. Montgomery     Burns        C.   Montgomery  <NA>           C.  Burns        FALSE
## 3 Rev. Timothy Lovejoy   Lovejoy    Timothy        <NA> Rev.      Timothy Lovejoy         TRUE
## 4         Ned Flanders  Flanders        Ned        <NA>  <NA>        Ned Flanders        FALSE
## 5       Simpson, Homer   Simpson      Homer        <NA>  <NA>       Homer Simpson        FALSE
## 6   Dr. Julius Hibbert   Hibbert     Julius        <NA>  Dr.       Julius Hibbert         TRUE
##   second_name_test
## 1            FALSE
## 2             TRUE
## 3            FALSE
## 4            FALSE
## 5            FALSE
## 6            FALSE
# Show the second-name flags on their own.
name_df[["second_name_test"]]
## [1] FALSE  TRUE FALSE FALSE FALSE FALSE

Problem 4 - Describe the types of strings that conform to the following regular expressions and construct an example that is matched by the regular expression.

4.1 - [0-9]+\\$

This regular expression matches one or more digits immediately followed by a literal dollar sign ($).

# One string whose digits are followed by "$" and one with digits only.
problem_4_1_test_string <- c("453534$", "34556456")
unlist(str_match_all(string = problem_4_1_test_string, pattern = "[0-9]+\\$"))
## [1] "453534$"

4.2 - \\b[a-z]{1,4}\\b

This regular expression matches any lowercase words (alpha characters) that are between one and four letters long.

# Three words of at most four letters and one five-letter word.
problem_4_2_test_string <- c("one", "two", "three", "four")
unlist(str_match_all(string = problem_4_2_test_string, pattern = "\\b[a-z]{1,4}\\b"))
## [1] "one"  "two"  "four"

4.3 - .*?\\.txt$

This regular expression matches any string that ends in “.txt”: “.*?” lazily matches zero or more of any character, “\\.txt” matches the literal extension, and “$” anchors the match to the end of the string.

# A bare "txt", a file name ending in ".txt" and one ending in ".csv".
problem_4_3_test_string <- c("txt", "file.txt", "file.csv")
unlist(str_match_all(string = problem_4_3_test_string, pattern = ".*?\\.txt$"))
## [1] "file.txt"

4.4 - \\d{2}/\\d{2}/\\d{4}

This regular expression matches dates written as two digits, a slash, two digits, a slash and four digits, i.e. the format mm/dd/yyyy (or dd/mm/yyyy).

# A date with two-digit month and day, and one with single digits.
problem_4_4_test_string <- c("12/25/2016", "1/1/2017")
unlist(str_match_all(string = problem_4_4_test_string, pattern = "\\d{2}/\\d{2}/\\d{4}"))
## [1] "12/25/2016"

4.5 - <(.+?)>.+?</\\1>

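This regular expression returns any text encased within HTML tags, which start with an opening tag such as “<p>” and end with the matching closing tag “</p>”. However, it does not work when attributes are applied to tags, because the back-reference requires the closing tag to repeat everything captured inside the opening tag.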

# The second string carries an attribute inside its opening tag
# ("<p color=black>"). The back-reference \\1 would then need a closing
# tag "</p color=black>", which is not there, so only the first string
# matches. It was previously typed as "<p> color=black>", which closed
# the tag early and matched after all. The result holds the full match
# followed by the captured tag name.
problem_4_5_test_string <- c("<p> This text is black </p>", "<p color=black> No match </p>")
unlist(str_match_all(problem_4_5_test_string, "<(.+?)>.+?</\\1>"))
## [1] "<p> This text is black </p>"    "p"                             
## [3] "<p> color=black> No match </p>" "p"

Problem 9 - The following code hides a secret message. Crack it with R and regular expressions. Hint: some of the characters are more revealing than others! The code snippet is also available in the materials at www.r-datacollection.com.

secret_message <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"
# Keep only the runs of upper-case letters and punctuation and join
# them into one string. cat() returns NULL, so assigning its result
# left `message` empty; build the text with paste0() first, store it,
# and then print it (still without a trailing newline).
message <- paste0(unlist(str_extract_all(secret_message, "([A-Z]+|[[:punct:]]){1,}")),
    collapse = "")
cat(message)
## CONGRATULATIONS.YOU.ARE.A.SUPERNERD!