Please deliver links to an R Markdown file (in GitHub and rpubs.com) with solutions to problems 3 and 4 from chapter 8 of Automated Data Collection in R. Problem 9 is extra credit.

library(stringr)

# Source text for the exercise: phone numbers and character names run together
# in a single string with no separators.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer555-3642Dr. Julius Hibbert"
print(raw.data)

## [1] "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer555-3642Dr. Julius Hibbert"

# Names: every run of two or more letters, periods, commas, or spaces.
# Digits, hyphens, and parentheses are outside the character class, so the
# phone numbers act as separators between the names.
names <- unlist(
  str_extract_all(string = raw.data, pattern = "[[:alpha:]., ]{2,}")
)
print(names)

## [1] "Moe Szyslak"          "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders"         "Simpson, Homer"       "Dr. Julius Hibbert"

# Phone numbers: an optional three-digit area code (optionally wrapped in
# parentheses), an optional hyphen or space, three digits, another optional
# hyphen or space, and four digits.
phone <- unlist(
  str_extract_all(
    string = raw.data,
    pattern = "\\(?(\\d{3})?\\)?(-| )?\\d{3}(-| )?\\d{4}"
  )
)
print(phone)

## [1] "555-1239"       "(636) 555-0113" "555-6542"       "555 8904"      
## [5] "636-555-3226"   "555-3642"

# Pair each extracted name with the phone number at the same position.
df <- data.frame(
  names = names,
  phone = phone
)
print(df)

##                  names          phone
## 1          Moe Szyslak       555-1239
## 2 Burns, C. Montgomery (636) 555-0113
## 3 Rev. Timothy Lovejoy       555-6542
## 4         Ned Flanders       555 8904
## 5       Simpson, Homer   636-555-3226
## 6   Dr. Julius Hibbert       555-3642

Use the tools of this chapter to rearrange the vector so that all elements conform to the standard first_name last_name.

# Extract the raw names. This vector is left untouched because the questions
# that follow inspect the titles and initials it still contains.
names <- unlist(str_extract_all(raw.data, "[[:alpha:]., ]{2,}"))

# Rearrange into "first_name last_name" order:
#   1. "Last, First" entries have the part after the comma moved to the front.
#   2. A leading title (an abbreviation of two or more letters followed by a
#      period and a space) is dropped; single-letter initials such as "C." are
#      kept.
# For the names above this yields: "Moe Szyslak", "C. Montgomery Burns",
# "Timothy Lovejoy", "Ned Flanders", "Homer Simpson", "Julius Hibbert".
standard_names <- str_replace(names, "^([^,]+), (.+)$", "\\2 \\1")
standard_names <- str_replace(standard_names, "^[[:alpha:]]{2,}\\. ", "")
names

## [1] "Moe Szyslak"          "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders"         "Simpson, Homer"       "Dr. Julius Hibbert"

Construct a logical vector indicating whether a character has a title (i.e., Rev. and Dr.).

# Flag names that carry a title ("Rev." or "Dr.").
# The periods are escaped so they match a literal "."; unescaped, "." matches
# any character, so "Rev.|Dr." would also flag names such as "Reva" or "Drew".
title <- str_detect(names, "Rev\\.|Dr\\.")
title

## [1] FALSE FALSE  TRUE FALSE FALSE  TRUE

Construct a logical vector indicating whether a character has a second name.

# A second name shows up as a space, one capital letter, and a period
# (for example the " C." in "Burns, C. Montgomery").
second_name <- str_detect(string = names, pattern = " [A-Z]\\.")
print(second_name)

## [1] FALSE  TRUE FALSE FALSE FALSE FALSE

Describe the types of strings that conform to the following regular expressions and construct an example that is matched by the regular expression.

[0-9]+\$ Matches one or more digits immediately followed by a literal dollar sign ($).

# Example: the run of digits directly before the "$" is returned with the "$".
print(str_extract(string = "vbftghr234568$", pattern = "[0-9]+\\$"))

## [1] "234568$"

\b[a-z]{1,4}\b Matches a whole word made up of one to four lowercase letters (the \b anchors require a word boundary on both sides).

# Example: the three short lowercase words match in full; the all-digit entry
# contains no lowercase letters and yields NA.
string <- c("rtyu", "ty", "r", "1234")
print(str_extract(string = string, pattern = "\\b[a-z]{1,4}\\b"))

## [1] "rtyu" "ty"   "r"    NA

.*?\.txt$ Matches any string that ends with the literal suffix .txt (for example, a text file name), including the bare string ".txt".

# Example: only the entries ending in ".txt" are returned; the .csv entry
# yields NA.
string <- c("str.txt", "xyz.csv", ".txt")
print(str_extract(string = string, pattern = ".*?\\.txt$"))

## [1] "str.txt" NA        ".txt"

\d{2}/\d{2}/\d{4} Matches two digits, a slash, two digits, a slash, and four digits, such as a date written as dd/mm/yyyy or mm/dd/yyyy.

# Example: only the entry with two-digit day and month fields separated by
# slashes matches; the other two entries yield NA.
string <- c("09122019", "9/12/2019", "09/12/2019")
print(str_extract(string = string, pattern = "\\d{2}/\\d{2}/\\d{4}"))

## [1] NA           NA           "09/12/2019"

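<(.+?)>.+?</\1> Matches an HTML/XML element: an opening tag, at least one character of content, and a closing tag whose name is identical to the opening tag (enforced by the \1 backreference).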

# Example: only the entry with a properly closed tag matches; an unclosed tag
# and a missing closing tag both yield NA.
string <- c(
  "<title>document</title>",
  "<title>document<title>",
  "<title>document"
)
print(str_extract(string = string, pattern = "<(.+?)>.+?</\\1>"))

## [1] "<title>document</title>" NA                       
## [3] NA

BONUS Question (9)

The following code hides a secret message. Crack it with R and regular expressions. Hint: Some of the characters are more revealing than others! The code snippet is also available in the materials at www.r-datacollection.com.

# Cipher text for the bonus question. The original literal spans four lines,
# so the line breaks are part of the string; the pieces are joined with "\n"
# to reproduce it exactly.
raw <- paste(
  c(
    "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0Tanwo",
    "Uwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigO",
    "d6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5",
    "fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"
  ),
  collapse = "\n"
)

Modifying the vector

# Keep only the "revealing" characters (upper-case letters plus "." and "!"),
# one character per vector element, then turn each period into a space because
# the periods mark the word breaks in the hidden message.
# The former single-argument paste() wrapper was a no-op on this character
# vector and has been dropped; fixed(".") matches the period literally.
separate <- str_replace(
  unlist(str_extract_all(raw, "[[:upper:].!]")),
  fixed("."),
  " "
)
# Glue the individual characters back into one string.
separate_adv <- str_c(separate, collapse = "")

Displaying Final Result

# Show the decoded message without surrounding quotation marks.
print(separate_adv, quote = FALSE)

## [1] CONGRATULATIONS YOU ARE A SUPERNERD!

END

Data 607-Homework3_Reg Exp

Emmanuel Hayble-Gomes

9/12/2019

BONUS Question (9)