CUNY MSDS Data 607 HW3

Nicholas Schettini

February 11, 2018

Libraries:

library(stringr)
library(tidyverse)
library(kableExtra)
library(knitr)
library(plyr)

3.) Copy the introductory example. The vector name stores the extracted names.

  1. Use the tools of this chapter to rearrange the vector so that all elements conform to the standard first_name last_name.
  2. Construct a logical vector indicating whether a character has a title (i.e., Rev. and Dr.).
  3. Construct a logical vector indicating whether a character has a second name.
# Raw input for exercise 3: names and phone numbers run together in a single
# string, with no delimiter between one entry and the next.
raw.data <-"555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"

(a) Use the tools of this chapter to rearrange the vector so that all elements conform to the standard first_name last_name.

# Pull the names out of the raw string: every run of two or more letters,
# periods, commas, or spaces (digits, hyphens, and parentheses break a run).
original_names <- unlist(str_extract_all(raw.data, "[[:alpha:]., ]{2,}"))
# Remove middle initials: whitespace, a single letter, and ". " collapse to one
# space. [[:alpha:]] is used rather than [A-z], because the A-z range also
# covers the punctuation between "Z" and "a" ([ \ ] ^ _ and the backtick).
names1 <- str_replace(original_names, "\\s[[:alpha:]]\\. ", " ")
# Turn "last, first" into "first last" by swapping the two captured words.
names2 <- str_replace(names1, "(\\w+),\\s(\\w+)", "\\2 \\1")
# Remove a leading title (2-3 letters followed by ". "). The replacement is the
# empty string so the cleaned name does not start with a stray space, and the
# ^ anchor keeps the pattern from touching anything but the start of the name.
new_names <- str_replace(names2, "^[[:alpha:]]{2,3}\\. ", "")
originaln <- data.frame(original_names)
newn <- data.frame(new_names)
# Show the original and the cleaned names as two side-by-side HTML tables.
kable(list(originaln, newn), "html", escape = FALSE) %>%
  kable_styling("striped", full_width = TRUE) %>%
  column_spec(1, bold = TRUE) %>%
  row_spec(1, bold = TRUE)
original_names
Moe Szyslak
Burns, C. Montgomery
Rev. Timothy Lovejoy
Ned Flanders
Simpson, Homer
Dr. Julius Hibbert
new_names
Moe Szyslak
Montgomery Burns
Timothy Lovejoy
Ned Flanders
Homer Simpson
Julius Hibbert

(b) Construct a logical vector indicating whether a character has a title (i.e., Rev. and Dr.).

# Logical vector: TRUE when a name starts with a title, i.e. 2-3 letters
# followed by ". " (as in "Rev. " or "Dr. "). [[:alpha:]] replaces [A-z], whose
# range also covers [ \ ] ^ _ and the backtick; ^ restricts the match to the
# beginning of the name.
title <- str_detect(names2, "^[[:alpha:]]{2,3}\\. ")
df1 <- data.frame(names2, title)
# Render the names next to their title flag as a striped HTML table.
kable(df1, "html", escape = FALSE) %>%
  kable_styling("striped", full_width = TRUE)
names2 title
Moe Szyslak FALSE
Montgomery Burns FALSE
Rev. Timothy Lovejoy TRUE
Ned Flanders FALSE
Homer Simpson FALSE
Dr. Julius Hibbert TRUE

(c) Construct a logical vector indicating whether a character has a second name.

# Logical vector: TRUE when a name contains a second (middle) name, which in
# this data appears as an initial -- one capital letter standing on its own,
# followed by a period (e.g. "C."). The \\b word boundary requires the capital
# to start a word, so the last letter of a longer abbreviation is not counted.
second_name <- str_detect(original_names, "\\b[A-Z]\\.")
df2 <- data.frame(original_names, second_name)
# Render the names next to their second-name flag as a striped HTML table.
kable(df2, "html", escape = FALSE) %>%
  kable_styling("striped", full_width = TRUE)
original_names second_name
Moe Szyslak FALSE
Burns, C. Montgomery TRUE
Rev. Timothy Lovejoy FALSE
Ned Flanders FALSE
Simpson, Homer FALSE
Dr. Julius Hibbert FALSE

4.) Describe the types of strings that conform to the following regular expressions and construct an example that is matched by the regular expression.

  1. [0-9]+\$
  2. \b[a-z]{1,4}\b
  3. .*?\.txt$
  4. \d{2}/\d{2}/\d{4}
  5. <(.+?)>.+?</\1>  

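(a) [0-9]+\$ Matches one or more digits followed by a literal dollar sign: [0-9] selects a digit 0-9, + repeats it one or more times, and \$ is an escaped '$' (without the backslash, $ would mean end of string).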

# Sample text with two digit runs that are each followed by a "$".
reg_a <- "534435457$saf!@#3123$"
# Extract every "digits then dollar sign" match from the sample.
str_extract_all(string = reg_a, pattern = "[0-9]+\\$")
## [[1]]
## [1] "534435457$" "3123$"

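(b) \b[a-z]{1,4}\b Matches whole words of one to four lowercase letters: \b is a word boundary, [a-z] is any lowercase letter a-z, and {1,4} gives the minimum and maximum number of characters.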

# Sample text mixing short lowercase words, longer words, and an uppercase word.
reg_b <- "nick hale y amanda nic khaleyamanda HI"
# Extract each standalone lowercase word that is 1 to 4 letters long.
str_extract_all(string = reg_b, pattern = "\\b[a-z]{1,4}\\b")
## [[1]]
## [1] "nick" "hale" "y"    "nic"

(c) .*?\.txt$ Pulls out file names ending in ".txt": . is any character except a line break, *? repeats it zero or more times (lazily), \. is a literal '.', and $ anchors the match to the end of the line.

# Sample text: a file path that ends in the ".txt" extension.
reg_c <- "c:/local/nic/test.txt"
# Extract the text running up to and including a trailing ".txt".
str_extract_all(string = reg_c, pattern = ".*?\\.txt$")
## [[1]]
## [1] "c:/local/nic/test.txt"

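(d) \d{2}/\d{2}/\d{4} Pulls out dates written as two digits, two digits, and four digits separated by slashes: \d is any digit, {2} and {4} require exactly that many of them, and / is a literal '/'.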

# Sample text: a date in two-digit/two-digit/four-digit form after a space.
rec_d <- " 02/10/2018"
# Extract every date-shaped substring from the sample.
str_extract_all(string = rec_d, pattern = "\\d{2}/\\d{2}/\\d{4}")
## [[1]]
## [1] "02/10/2018"

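(e) <(.+?)>.+?</\1> Pulls out HTML-style elements: <(.+?)> captures the text inside an opening tag, .+? lazily matches at least one character of content, and </\1> requires a closing tag whose text is identical to the captured group.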

# Sample text: a paragraph element with matching opening and closing tags.
reg_e <- "<p>this is a paragraph</p>"
# Extract each element whose closing tag repeats the captured opening tag.
str_extract_all(string = reg_e, pattern = "<(.+?)>.+?</\\1>")
## [[1]]
## [1] "<p>this is a paragraph</p>"

9.) The following code hides a secret message. Crack it with R and regular expressions. Hint: Some of the characters are more revealing than others!

mystery <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"
# Keep only the revealing characters: capital letters and periods. The flat
# class [A-Z.] matches the same characters as the nested [[A-Z].] without
# depending on the regex engine's set-union syntax.
mystery1 <- unlist(str_extract_all(mystery, "[A-Z.]"))
# Concatenate the extracted characters into one string (no separator).
mystery1 <- paste(mystery1, collapse = "")
# The periods act as word separators; replace each one with a space.
str_replace_all(mystery1, "[.]", " ")
## [1] "CONGRATULATIONS YOU ARE A SUPERNERD"