DATA 607 - Assignment 3

library(tidyverse)
library(dbplyr)
library(kableExtra)


#' Show the first regex match for each input string as a markdown table
#'
#' @param regex_strings Character vector of strings to test.
#' @param regex_pattern Regular expression handed to stringr.
#' @return A `kable()` markdown table with two columns: `String` (the input)
#'   and `Match` (the first match found in that string, or "No Match").
data607RegexChecker <- function(regex_strings, regex_pattern) {
  # str_extract() yields the first match per string and NA when there is none.
  # Building the tibble from it avoids coercing an unnamed matrix (which relied
  # on the compatibility column name "V1") and keeps the function working when
  # none of the strings match at all.
  first_match <- str_extract(regex_strings, pattern = regex_pattern)

  m <- tibble(
    String = regex_strings,
    Match = if_else(
      # An empty match is reported as "No Match", as before.
      is.na(first_match) | first_match == "",
      "No Match",
      first_match
    )
  )

  kable(m, format = "markdown")
}

3. Create Strings.

Use the following string to complete the exercises:

# Raw exercise string: names interleaved with phone numbers.
names <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"

# Pull out every run of two or more letters, commas, periods and spaces, and
# hold the result in a one-column tibble named `name_strings`.
names <- str_extract_all(names, "[[a-zA-Z],. ]{2,}") %>%
  as_tibble(.name_repair = "universal") %>%
  rename(name_strings = "...1")

names <- names %>%
  mutate(
    Number = row_number(),
    # A title is 2-4 letters followed by a period at the start of the string.
    title = str_extract(name_strings, "^[a-zA-Z]{2,4}\\."),
    # Drop the title (and the space after it) before splitting the name.
    name_strings = str_replace_all(name_strings, "^[a-zA-Z]{2,4}\\.\\s", "")
  ) %>%
  mutate(
    # First word, keeping a trailing comma if there is one.
    first_name = str_trim(str_extract(name_strings, "^[a-zA-Z]+\\s?\\,?")),
    # Everything from the first whitespace to the end.
    last_name = str_trim(str_extract(name_strings, "(\\s.*)$"))
  ) %>%
  mutate(
    # A comma in the first word marks a "Last, First" entry: swap the parts.
    # last_name is updated first because it needs first_name with the comma.
    last_name = if_else(str_detect(first_name, "\\,"), first_name, last_name),
    first_name = if_else(
      str_detect(first_name, "\\,"),
      str_trim(str_extract(name_strings, "(\\s.*)$")),
      first_name
    )
  ) %>%
  mutate(
    last_name = str_replace(last_name, "\\,", ""),
    # Prefix the title only when one was found.
    first_name_last_name = if_else(
      is.na(title),
      str_c(first_name, last_name, sep = " "),
      str_c(title, first_name, last_name, sep = " ")
    )
  )

Use the tools of this chapter to rearrange the vector so that all the elements conform to the standard first_name last_name

# Keep only the rearranged "first_name last_name" column and render it.
first_last <- select(names, first_name_last_name)
kable(first_last, format = "markdown")

first_name_last_name
Moe Szyslak
C. Montgomery Burns
Rev. Timothy Lovejoy
Ned Flanders
Homer Simpson
Dr. Julius Hibbert

Construct a logical vector indicating whether a character has a title (e.g., Rev. and Dr.).

# Flag names that begin with a title: 2-4 letters followed by a period.
title_vector <- names %>%
  select(first_name_last_name) %>%
  mutate(has_title = str_detect(first_name_last_name, "^[a-zA-Z]{2,4}\\."))

kable(title_vector, format = "markdown")

first_name_last_name	has_title
Moe Szyslak	FALSE
C. Montgomery Burns	FALSE
Rev. Timothy Lovejoy	TRUE
Ned Flanders	FALSE
Homer Simpson	FALSE
Dr. Julius Hibbert	TRUE

Construct a logical vector indicating whether a character has a second name.

# Flag names that carry a second name: a leading initial such as "C." followed
# by exactly two more alphabetic words.
second_name_vector <- names %>%
  mutate(
    has_second_name = str_detect(
      first_name_last_name,
      # `[A-Z]\\.` demands a capital initial plus a period (the earlier
      # `[A-Z\\.]{2}` also accepted ".." or "AB"). The final class is
      # `[a-zA-Z]`, not `A-z`, whose range also admits `[`, `\`, `]`, `^`,
      # `_` and the backtick.
      "^[A-Z]\\.\\s[a-zA-Z]+\\s[a-zA-Z]+$"
    )
  ) %>%
  select(first_name_last_name, has_second_name)

kable(second_name_vector, format = "markdown")

first_name_last_name	has_second_name
Moe Szyslak	FALSE
C. Montgomery Burns	TRUE
Rev. Timothy Lovejoy	FALSE
Ned Flanders	FALSE
Homer Simpson	FALSE
Dr. Julius Hibbert	FALSE

4. Types of Strings

Describe the types of strings that conform to the following regular expressions and construct an example that is matched by the regular expression.

Regular Expression Pattern: [0-9]+\$

Description: One to unlimited digits followed by a single dollar sign ($).

For example:

# Try the digits-then-dollar-sign pattern on five sample strings.
data607RegexChecker(
  regex_strings = c("0$", "000r00?", "123456$", "1111111111$", "go_yankees!"),
  regex_pattern = "[0-9]+\\$"
)

## Warning: `as_tibble.matrix()` requires a matrix with column names or a `.name_repair` argument. Using compatibility `.name_repair`.
## This warning is displayed once per session.

String	Match
0$	0$
000r00?	No Match
123456$	123456$
1111111111$	1111111111$
go_yankees!	No Match

Regular Expression Pattern: \b[a-z]{1,4}\b

Description: Matches a whole word made up of one to four lowercase letters. The \b on each side is a word boundary, so the letters cannot be part of a longer word.

For example:

# Try the 1-4 lowercase-letter word pattern on five sample strings.
data607RegexChecker(
  regex_strings = c("a", "abcd", "abcD", "go", "yank"),
  regex_pattern = "\\b[a-z]{1,4}\\b"
)

String	Match
a	a
abcd	abcd
abcD	No Match
go	go
yank	yank

Regular Expression Pattern: .*?\.txt$

Description: Lazily matches any character except a line terminator, zero or more times, followed by a literal period and then the characters txt at the end of the string. This resembles a DOS-style search that looks for all files that end in .txt.

For example:

# Try the ".txt at end of string" pattern on five sample file names.
data607RegexChecker(
  regex_strings = c("resume.txt", "memo.txt", "spreadsheet.xls", "go.txt", "file.pdf"),
  regex_pattern = ".*?\\.txt$"
)

String	Match
resume.txt	resume.txt
memo.txt	memo.txt
spreadsheet.xls	No Match
go.txt	go.txt
file.pdf	No Match

Regular Expression Pattern: \d{2}/\d{2}/\d{4}

Description: Two digits followed by a forward slash, followed by two digits, followed by a forward slash, followed by four digits. In other words, this could represent a traditional date format.

For example:

# Try the dd/dd/dddd pattern on five sample date strings.
data607RegexChecker(
  regex_strings = c("04/07/1963", "05/19/1998", "09/11/2001", "8/7/1980", "09/15/2019"),
  regex_pattern = "\\d{2}/\\d{2}/\\d{4}"
)

String	Match
04/07/1963	04/07/1963
05/19/1998	05/19/1998
09/11/2001	09/11/2001
8/7/1980	No Match
09/15/2019	09/15/2019

Regular Expression Pattern: <(.+?)>.+?</\1>

Description: This matches HTML-style elements. Initially it matches the opening tag, capturing one or more characters inside the first angle brackets (<>). Next it matches one or more characters of content after the opening tag. Finally it matches the closing tag, reusing the result of the first capture group (\1) inside it ( </\1> ), so the closing tag must repeat whatever the opening tag matched.

# Apply the back-reference pattern to five sample strings; simplify = TRUE
# returns a character matrix with one row per input string.
m <- str_extract_all(
  string = c(
    "<H1> This is a header </H1>",
    "<p> This is a paragraph </p>",
    "<date> 12/13/234</date>",
    "<ul>unordered list</ul>",
    "Date"
  ),
  pattern = "<(.+?)>.+?</\\1>",
  simplify = TRUE
)

m

##      [,1]                          
## [1,] "<H1> This is a header </H1>" 
## [2,] "<p> This is a paragraph </p>"
## [3,] "<date> 12/13/234</date>"     
## [4,] "<ul>unordered list</ul>"     
## [5,] ""

Bonus: Shiny R-flavored Regex Tester

Here is a link to a shiny regex tester that allows you to use R-flavored regular expressions. I used it to test my expressions.

https://adamspannbauer.github.io/2018/01/16/r-regex-tester-shiny-app/

9. Secret Message

The following code hides a secret message. Crack it with R and regular expressions. Hint: Some of the characters are more revealing than others! The code snippet is also available in the materials at www.r-datacollection.com

# Scrambled text that hides the secret message.
code <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8pf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"

There appear to be more lowercase characters than uppercase, so I will strip all the lowercase characters (along with digits and whitespace) and see what that yields:

# Remove lowercase letters, whitespace and digits, then turn the remaining
# periods into spaces so the words are separated.
secret_message <- code %>%
  str_replace_all("[a-z\\s\\d]+", "") %>%
  str_replace_all("[\\.]", " ")

THE SECRET MESSAGE IS: CONGRATULATIONS YOU ARE A SUPERNERD!

DATA 607 - Assignment 3

Jim Mundy

3. Create Strings.

4. Types of Strings

9. Secret Message