CUNY MSDS Data 607 HW3

Libraries:

library(stringr)
library(tidyverse)
library(kableExtra)
library(knitr)
library(plyr)

3.) Copy the introductory example. The vector name stores the extracted names.

Use the tools of this chapter to rearrange the vector so that all elements conform to the standard first_name last_name.
Construct a logical vector indicating whether a character has a title (i.e., Rev. and Dr.).
Construct a logical vector indicating whether a character has a second name.

# Raw phone-book text: names and phone numbers run together without any
# delimiter. The literal is split only to keep lines short; paste0() rebuilds
# the exact same single string.
raw.data <- paste0(
  "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery",
  "555-6542Rev. Timothy Lovejoy555 8904Ned Flanders",
  "636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"
)

(a) Use the tools of this chapter to rearrange the vector so that all elements conform to the standard first_name last_name.

# Pull out every run (length >= 2) of letters, periods, commas and spaces;
# for raw.data these runs are the six names between the phone numbers.
original_names <- unlist(str_extract_all(raw.data, "[[:alpha:]., ]{2,}"))
# Drop a middle initial: whitespace, one letter, a period, then a space.
# [[:alpha:]] replaces [A-z], whose range also covers [ \ ] ^ _ and `.
names1 <- str_replace(original_names, "\\s[[:alpha:]]\\. ", " ")
# Turn "Last, First" into "First Last" by swapping the two captured words.
names2 <- str_replace(names1, "(\\w+),\\s(\\w+)", "\\2 \\1")
# Strip a title: a whole 2-3 letter word followed by a period and whitespace.
# \\b keeps the pattern from matching the tail of a longer word, and replacing
# with "" (rather than " ") avoids leaving a leading space on the name.
new_names <- str_replace(names2, "\\b[[:alpha:]]{2,3}\\.\\s+", "")

# Wrap each character vector in a one-column data frame so both can be handed
# to kable() together as a list.
originaln <- data.frame(original_names)
newn <- data.frame(new_names)

# Render the two tables as striped HTML with the first column and first row in
# bold. TRUE/FALSE are spelled out because T and F are ordinary variables that
# can be reassigned.
kable(list(originaln, newn), "html", escape = FALSE) %>%
  kable_styling("striped", full_width = TRUE) %>%
  column_spec(1, bold = TRUE) %>%
  row_spec(1, bold = TRUE)

original_names
Moe Szyslak
Burns, C. Montgomery
Rev. Timothy Lovejoy
Ned Flanders
Simpson, Homer
Dr. Julius Hibbert

new_names
Moe Szyslak
Montgomery Burns
Timothy Lovejoy
Ned Flanders
Homer Simpson
Julius Hibbert

(b) Construct a logical vector indicating whether a character has a title (i.e., Rev. and Dr.).

# TRUE where the name carries a title: a whole 2-3 letter word followed by a
# period and whitespace (e.g. "Rev. " or "Dr. "). [[:alpha:]] replaces [A-z],
# whose range also covers [ \ ] ^ _ and `, and \\b stops the pattern from
# matching the last letters of a longer word that happens to end in a period.
title <- str_detect(names2, "\\b[[:alpha:]]{2,3}\\.\\s")
df1 <- data.frame(names2, title)
# TRUE/FALSE are spelled out because T and F can be reassigned.
kable(df1, "html", escape = FALSE) %>%
  kable_styling("striped", full_width = TRUE)

names2	title
Moe Szyslak	FALSE
Montgomery Burns	FALSE
Rev. Timothy Lovejoy	TRUE
Ned Flanders	FALSE
Homer Simpson	FALSE
Dr. Julius Hibbert	TRUE

(c) Construct a logical vector indicating whether a character has a second name.

# TRUE where the name contains an initial: a single capital letter standing on
# its own (word boundary before it) and followed by a period, e.g. "C.".
# The boundary keeps an upper-case abbreviation such as "DR." from being
# mistaken for an initial; the redundant {1} quantifier is dropped.
second_name <- str_detect(original_names, "\\b[A-Z]\\.")
df2 <- data.frame(original_names, second_name)
# TRUE/FALSE are spelled out because T and F can be reassigned.
kable(df2, "html", escape = FALSE) %>%
  kable_styling("striped", full_width = TRUE)

original_names	second_name
Moe Szyslak	FALSE
Burns, C. Montgomery	TRUE
Rev. Timothy Lovejoy	FALSE
Ned Flanders	FALSE
Simpson, Homer	FALSE
Dr. Julius Hibbert	FALSE

4.) Describe the types of strings that conform to the following regular expressions and construct an example that is matched by the regular expression.

[0-9]+\$
\b[a-z]{1,4}\b
.*?\.txt$
\d{2}/\d{2}/\d{4}
<(.+?)>.+?</\1>

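(a) [0-9]+\$ matches one or more digits ([0-9] is any digit 0-9, + means one or more times) immediately followed by a literal dollar sign (\$), for example "3123$".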

# Example for (a): the input holds two digit runs that are each directly
# followed by "$"; the lone "3" before "123$" is part of the second run.
reg_a <- "534435457$saf!@#3123$"
reg_a %>% str_extract_all("[0-9]+\\$")

## [[1]]
## [1] "534435457$" "3123$"

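(b) \b[a-z]{1,4}\b matches a whole word of one to four lowercase letters: \b is a word boundary (word edge), [a-z] is any lowercase letter a-z, and {1,4} gives the minimum and maximum number of characters.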

# Example for (b): only the lowercase words of at most four letters are
# returned; longer words and the upper-case "HI" are skipped.
reg_b <- "nick hale y amanda nic khaleyamanda HI"
str_extract_all(string = reg_b, pattern = "\\b[a-z]{1,4}\\b")

## [[1]]
## [1] "nick" "hale" "y"    "nic"

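(c) .*?\.txt$ matches any string that ends in ".txt", such as a file name: . is any character except a line break, *? repeats it zero or more times (lazily), \. is a literal period, and $ anchors the match at the end of the string.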

# Example for (c): a file path ending in ".txt", so the whole string matches.
reg_c <- "c:/local/nic/test.txt"
reg_c %>% str_extract_all(".*?\\.txt$")

## [[1]]
## [1] "c:/local/nic/test.txt"

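(d) \d{2}/\d{2}/\d{4} matches dates written as two digits, two digits and four digits separated by slashes (e.g. mm/dd/yyyy): \d is any digit, {2} and {4} require exactly that many of them, and / is a literal "/".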

# Example for (d): the leading space is not part of the match, only the date.
rec_d <- " 02/10/2018"
str_extract_all(string = rec_d, pattern = "\\d{2}/\\d{2}/\\d{4}")

## [[1]]
## [1] "02/10/2018"

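(e) <(.+?)>.+?</\1> matches an HTML-style element: an opening tag <(.+?)> whose name is captured, at least one character of content (.+?), and a closing tag </\1> that must repeat the captured name through the backreference \1. For example: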

# Example for (e): the closing tag </p> repeats the captured tag name "p".
reg_e <- "<p>this is a paragraph</p>"
reg_e %>% str_extract_all("<(.+?)>.+?</\\1>")

## [[1]]
## [1] "<p>this is a paragraph</p>"

9.) The following code hides a secret message. Crack it with R and regular expressions. Hint: Some of the characters are more revealing than others!

mystery <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"
# Keep only the capital letters and the periods, then glue the extracted
# characters back into one string with nothing between them.
mystery1 <- mystery %>%
  str_extract_all("[[A-Z].]") %>%
  unlist() %>%
  paste(collapse = "")
# The periods mark the word breaks, so show each one as a space.
str_replace_all(mystery1, "[.]", " ")

## [1] "CONGRATULATIONS YOU ARE A SUPERNERD"