Chapter 8 - Automated Data Collection with R

Chapter 8: Question #3. Copy the introductory example. The vector `name` stores the extracted names.

library(stringr)
# Raw phone-book text: names and phone numbers are run together in one string.

raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"

# Keep every run of two or more letters, periods, commas, or spaces. Digits,
# dashes, and parentheses are not in the character class, so the phone numbers
# are left behind. str_extract_all() returns a list with one element per input
# string; raw.data is a single string, so take that one element.
name <- str_extract_all(raw.data, "[[:alpha:]., ]{2,}")[[1]]

name
## [1] "Moe Szyslak"          "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders"         "Simpson, Homer"       "Dr. Julius Hibbert"

Q #3a. Use the tools of this chapter to rearrange the vector so that all elements conform to the standard: first_name last_name.

# Step 1: for "last, first" entries, move everything from the comma onward in
# front of the last name. The ", " separator travels with the captured first
# name and is stripped in step 2; names without a comma are left unchanged.
name2 <- str_replace_all(name, "(.+)(, .+)$", "\\2 \\1")
name2
## [1] "Moe Szyslak"           ", C. Montgomery Burns" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders"          ", Homer Simpson"       "Dr. Julius Hibbert"
# Step 2: drop the leftover ", " separator.
name3 <- str_replace_all(name2, ", ", "")
name3
## [1] "Moe Szyslak"          "C. Montgomery Burns"  "Rev. Timothy Lovejoy"
## [4] "Ned Flanders"         "Homer Simpson"        "Dr. Julius Hibbert"
# Step 3: remove titles such as "Dr." or "Rev." (a capital letter, one or two
# lowercase letters, then a period). The trailing \\s* also consumes the space
# after the title so no leading blank is left behind, and str_trim() removes
# any whitespace that might remain at either end. The initial "C." is kept
# because no lowercase letter sits between the capital and the period.
name4 <- str_trim(str_replace_all(name3, "[A-Z][a-z]{1,2}\\.\\s*", ""))
name4
## [1] "Moe Szyslak"         "C. Montgomery Burns" "Timothy Lovejoy"
## [4] "Ned Flanders"        "Homer Simpson"       "Julius Hibbert"

Q #3b. Construct a logical vector indicating whether a character has a title (i.e., Rev. and Dr.)

# Flag each title separately. The period is escaped so that it matches a
# literal "." instead of acting as the regex wildcard for any character.
Rev_title <- str_detect(name, "Rev\\. ")
Dr_title <- str_detect(name, "Dr\\. ")
# Single logical vector answering the question: TRUE when either title occurs.
has_title <- Rev_title | Dr_title
has_title
## [1] FALSE FALSE  TRUE FALSE FALSE  TRUE
# Side-by-side view of the names and the per-title flags.
df <- data.frame(name, Rev_title, Dr_title)
df
##                   name Rev_title Dr_title
## 1          Moe Szyslak     FALSE    FALSE
## 2 Burns, C. Montgomery     FALSE    FALSE
## 3 Rev. Timothy Lovejoy      TRUE    FALSE
## 4         Ned Flanders     FALSE    FALSE
## 5       Simpson, Homer     FALSE    FALSE
## 6   Dr. Julius Hibbert     FALSE     TRUE

Q #3c. Construct a logical vector indicating whether a character has a second name

# A second name appears as a single capital initial followed by a period
# (e.g. "C."). The leading \\b word boundary requires the capital to start a
# word, so a capital that merely ends a longer abbreviation is not counted.
# Titles such as "Dr." and "Rev." do not match because a lowercase letter,
# not a capital, precedes their period.
df$SecondName <- str_detect(string = name3, pattern = "\\b[A-Z]\\.")
df
##                   name Rev_title Dr_title SecondName
## 1          Moe Szyslak     FALSE    FALSE      FALSE
## 2 Burns, C. Montgomery     FALSE    FALSE       TRUE
## 3 Rev. Timothy Lovejoy      TRUE    FALSE      FALSE
## 4         Ned Flanders     FALSE    FALSE      FALSE
## 5       Simpson, Homer     FALSE    FALSE      FALSE
## 6   Dr. Julius Hibbert     FALSE     TRUE      FALSE

Q #4. Describe the types of strings that conform to the following regular expressions and construct an example that is matched by the regular expression

Q #4a. “[0-9]+\$”

statement_1 <- "[0-9]+\\$"
test_1 <- c("7777$", "98$", "123231231")
str_detect(test_1, statement_1)
## [1]  TRUE  TRUE FALSE
# One or more digits immediately followed by a literal dollar sign. The
# backslash escapes "$", so it is a plain character here rather than the
# end-of-string anchor. Example match: "98$"; "123231231" fails because it
# contains no "$".

Q #4b. “\b[a-z]{1,4}\b”

statement_2 <- "\\b[a-z]{1,4}\\b"
test_2 <- c("data", "science", "is", "cool")
str_detect(string = test_2, pattern = statement_2)
## [1]  TRUE FALSE  TRUE  TRUE
# A whole word made of one to four lowercase letters; the \b on each side
# pins the match to word boundaries, so the seven-letter "science" fails.

Q #4c. “.*?\.txt$”

statement_3 <- ".*?\\.txt$"
test_3 <- c("file.pdf", "file.txt", "file.dmg", "homework.jpg")
str_detect(string = test_3, pattern = statement_3)
## [1] FALSE  TRUE FALSE FALSE
# statement_3 matches any string that ends in a literal ".txt": zero or more
# arbitrary characters, then ".txt", then the end of the string.

Q #4d. “\d{2}/\d{2}/\d{4}”

statement_4 <- "\\d{2}/\\d{2}/\\d{4}"
test_4 <- c("11/15/2012", "07/23/2018", "01/01/2006")
str_detect(string = test_4, pattern = statement_4)
## [1] TRUE TRUE TRUE
# Two digits, a slash, two digits, a slash, then four digits (xx/xx/xxxx).
# A typical use is a date written as mm/dd/yyyy.

Q #4e. “<(.+?)>.+?</\1>”

statement_5 <- "<(.+?)>.+?</\\1>"
test_5 <- c("<hello>hi</hello>", "<hi>hello</hello>")
str_detect(string = test_5, pattern = statement_5)
## [1]  TRUE FALSE
# Matches HTML/XML-style markup: an opening tag, at least one character of
# content, and a closing tag whose name repeats the opening tag's name (the
# \1 backreference). "<hi>hello</hello>" fails because the tag names differ.

Q #9. The following code hides a secret message. Crack it with R and regular expressions. Hint: Some of the characters are more revealing than others! The code snippet is also available in the materials at www.r-datacollection.com

secret_message <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0Tanwo Uwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigO d6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5 fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"

# The revealing characters are the uppercase letters and the periods; all
# other characters are discarded.
regex <- "[[:upper:].]+"

# Glue the extracted pieces back together into one string.
revealed <- paste(unlist(str_extract_all(secret_message, regex)), collapse = "")

# Each run of periods becomes a single space between words.
str_replace_all(revealed, pattern = "[\\.]+", replacement = " ")
## [1] "CONGRATULATIONS YOU ARE A SUPERNERD"
#Thanks for the compliment :)  Appreciate it.