Assignment 3

Please deliver links to an R Markdown file (in GitHub and rpubs.com) with solutions to problems 3 and 4 from chapter 8 of Automated Data Collection in R. Problem 9 is extra credit. You may work in a small group, but please submit separately with names of all group participants in your submission.

Here is the referenced code for the introductory example in #3:

library(stringr)
library(XML)

3

# Raw phone-book string from the chapter's introductory example: names and
# phone numbers are run together with no delimiter. It is assembled from one
# fragment per entry purely to keep source lines short.
raw.data <- paste0(
  "555-1239Moe Szyslak",
  "(636) 555-0113Burns, C. Montgomery",
  "555-6542Rev. Timothy Lovejoy",
  "555 8904Ned Flanders",
  "636-555-3226Simpson, Homer",
  "5543642Dr. Julius Hibbert"
)

3.1 Use the tools of this chapter to rearrange the vector so that all elements conform to the standard first_name last_name

# Extract every run of two or more letters, periods, commas or spaces; the
# digits and phone punctuation between the names act as separators.
# The range is spelled A-Za-z on purpose: A-z also covers the six characters
# between "Z" and "a" ([ \ ] ^ _ `), which are not part of a name.
name <- unlist(str_extract_all(raw.data, "[A-Za-z., ]{2,}"))
name

## [1] "Moe Szyslak"          "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders"         "Simpson, Homer"       "Dr. Julius Hibbert"

#we can also use:
#name <- unlist(str_extract_all(raw.data, "[[:alpha:]., ]{2,}"))
#name

# First name: either the final word after a "." or "," (the "Last, First"
# layout) or a word of two or more letters followed by a space.
# str_extract() returns exactly one value per element of `name` (NA when
# nothing matches), so the result always stays aligned with `name`;
# unlist(str_extract_all()) would silently drop or add elements.
# A-Za-z replaces A-z, which also matches [ \ ] ^ _ and `.
firstName <- str_extract(name, "[.,] [A-Za-z]{2,}$|[A-Za-z]{2,} ")
# Strip the leading punctuation / trailing space captured above.
firstName <- str_extract(firstName, "[A-Za-z]{2,}")
firstName

## [1] "Moe"        "Montgomery" "Timothy"    "Ned"        "Homer"     
## [6] "Julius"

# Last name: either the final word when the preceding word does not end in
# "." or "," (the "First Last" layout) or a word directly followed by ", "
# (the "Last, First" layout).
# str_extract() keeps one value per element of `name` (NA when nothing
# matches), so the result stays aligned with `name`.
# A-Za-z replaces A-z, which also matches [ \ ] ^ _ and `.
lastName <- str_extract(name, "[^.,] [A-Za-z]{2,}$|[A-Za-z]{2,}, ")
# Keep only the name itself, dropping the context character, space or comma.
lastName <- str_extract(lastName, "[A-Za-z]{2,}")
lastName

## [1] "Szyslak"  "Burns"    "Lovejoy"  "Flanders" "Simpson"  "Hibbert"

# Construct the data frame by combining the first-name and last-name columns.
# data.frame() converts the space in each column name to "." when printing.
data.frame("First Name" = firstName, "Last Name" = lastName)

##   First.Name Last.Name
## 1        Moe   Szyslak
## 2 Montgomery     Burns
## 3    Timothy   Lovejoy
## 4        Ned  Flanders
## 5      Homer   Simpson
## 6     Julius   Hibbert

3.2 Construct a logical vector indicating whether a character has a title (i.e., Rev. and Dr.)

# A title is taken to be two or more letters immediately followed by a
# period (e.g. "Rev.", "Dr."); a single-letter initial such as "C." is not.
# A-Za-z replaces A-z, which also matches [ \ ] ^ _ and `.
myTitles <- unlist(str_extract_all(name, "[A-Za-z]{2,}\\."))
myTitles

## [1] "Rev." "Dr."

# Test every name against the title pattern itself (two or more letters
# followed by a literal period). Passing the vector of extracted titles as
# the pattern would pair names and titles position by position, which is
# only right by coincidence of ordering, and would treat "." as a wildcard.
data.frame("Name" = name, "hasTitle" = str_detect(name, "[A-Za-z]{2,}\\."))

##                   Name hasTitle
## 1          Moe Szyslak    FALSE
## 2 Burns, C. Montgomery    FALSE
## 3 Rev. Timothy Lovejoy     TRUE
## 4         Ned Flanders    FALSE
## 5       Simpson, Homer    FALSE
## 6   Dr. Julius Hibbert     TRUE

3.3 Construct a logical vector indicating whether a character has a second name

# A second name is recognised as a single-letter initial (optionally followed
# by a period) that sits between a space and another word.
# First pass: match with the leading space so the initial must start a word.
# A-Za-z replaces A-z, which also matches [ \ ] ^ _ and `.
secondName <- unlist(str_extract_all(name, " [A-Za-z]\\.? [A-Za-z]+\\.?"))
# Second pass: drop that leading space from each match.
secondName <- unlist(str_extract_all(secondName, "[A-Za-z]\\.? [A-Za-z]+\\.?"))
secondName

## [1] "C. Montgomery"

# Test every name against the second-name pattern directly: a space, a
# single-letter initial with an optional period, a space, then another word.
# This always yields one logical per name, whereas reusing the extracted text
# as a pattern breaks when zero or several second names are found and treats
# its "." as a wildcard.
data.frame(
  "Name" = name,
  "hasSecondName" = str_detect(name, " [A-Za-z]\\.? [A-Za-z]+")
)

##                   Name hasSecondName
## 1          Moe Szyslak         FALSE
## 2 Burns, C. Montgomery          TRUE
## 3 Rev. Timothy Lovejoy         FALSE
## 4         Ned Flanders         FALSE
## 5       Simpson, Homer         FALSE
## 6   Dr. Julius Hibbert         FALSE

4

Describe the types of strings that conform to the following regular expressions, and construct an example that is matched by each regular expression.

4.1

# 4.1  [0-9]+\\$
# At least one digit immediately followed by a literal dollar sign. The match
# may sit anywhere in the string, so "0$$" qualifies through its "0$" prefix.
mydata1 <- c(
  "246$",
  "97$",
  "4567$",
  "0$$"
)
pattern1 <- "[0-9]+\\$"
str_detect(string = mydata1, pattern = pattern1)

## [1] TRUE TRUE TRUE TRUE

4.2

# 4.2  \\b[a-z]{1,4}\\b
# A whole word made of one to four lower-case letters a-z. The \b on each
# side is a word boundary, so the letters cannot be part of a longer word.
mydata2 <- c(
  "data",
  "is",
  "not",
  "far",
  "from",
  "here"
)
pattern2 <- "\\b[a-z]{1,4}\\b"
str_detect(string = mydata2, pattern = pattern2)

## [1] TRUE TRUE TRUE TRUE TRUE TRUE

4.3

# 4.3  .*?\\.txt$
# Any string that ends in the literal suffix ".txt"; the lazy .*? absorbs
# whatever comes before it (possibly nothing).
mydata3 <- c(
  "data.txt",
  "file123.txt",
  "hellow.1.abc.234.txt",
  "000ZORO!999.txt"
)
pattern3 <- ".*?\\.txt$"
str_detect(string = mydata3, pattern = pattern3)

## [1] TRUE TRUE TRUE TRUE

4.4

# 4.4  \\d{2}/\\d{2}/\\d{4}
# Two digits, a slash, two digits, a slash, four digits -- the shape of a
# date written mm/dd/yyyy (U.S.) or dd/mm/yyyy (Europe). Only the shape is
# checked, not whether the numbers form a valid date, and the match may be
# embedded in longer text.
mydata4 <- c(
  "09/14/2019",
  "14/09/2019",
  "my kid was born in 03/10/2006"
)
pattern4 <- "\\d{2}/\\d{2}/\\d{4}"
str_detect(string = mydata4, pattern = pattern4)

## [1] TRUE TRUE TRUE

4.5

# 4.5  <(.+?)>.+?</\\1>
# An opening tag <name>, at least one character of content, then the closing
# tag </name>; the backreference \1 forces the closing tag's name to repeat
# whatever the first group captured. Typical of HTML/XML element pairs.
mydata5 <- c(
  "<div><h1>Data 607</h1><p>This class Data607 is awesome. Learning new things every week.</p></div>",
  "<html><body>Hello</body></html>",
  "CUNY is <bold>Cool</bold>."
)
pattern5 <- "<(.+?)>.+?</\\1>"
str_detect(string = mydata5, pattern = pattern5)

## [1] TRUE TRUE TRUE

9

Crack the secret Message

# Encoded text for exercise 9, stored as a single string.
# NOTE(review): this variable shares its name with base::message(); calls to
# that function still resolve, but the name is kept only because later steps
# refer to it.
message <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"

# Show the raw text before decoding.
print(message)

## [1] "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"

# Keep only the upper-case letters plus "." and "!"; everything else in the
# text (lower-case letters and digits) is discarded.
messageWithoutLowerCases <- unlist(
  str_extract_all(string = message, pattern = "[[:upper:].!]")
)

# One element per retained character.
messageWithoutLowerCases

##  [1] "C" "O" "N" "G" "R" "A" "T" "U" "L" "A" "T" "I" "O" "N" "S" "." "Y"
## [18] "O" "U" "." "A" "R" "E" "." "A" "." "S" "U" "P" "E" "R" "N" "E" "R"
## [35] "D" "!"

# Join the retained characters back into one string with no separator.
together <- paste0(messageWithoutLowerCases, collapse = "")

together

## [1] "CONGRATULATIONS.YOU.ARE.A.SUPERNERD!"

# Replace every "." with a single space to separate the words. fixed()
# matches the period literally, so no regex escaping is needed.
theSecret <- str_replace_all(together, fixed("."), " ")

theSecret

## [1] "CONGRATULATIONS YOU ARE A SUPERNERD!"

Assignment 3

Abdelmalek Hajjam

9/14/2019

3

4

9