Chapter 8: Questions #3 Copy the introductory example. The vector name stores the extracted names.
library(stringr)
# Raw phone-book text: names and phone numbers are run together without delimiters
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"
# A name is any run of at least two letters, periods, commas or spaces.
# str_extract_all() returns a list with one element per input string; there is
# a single input here, so its first element is the complete vector of names.
name <- str_extract_all(string = raw.data, pattern = "[[:alpha:]., ]{2,}")[[1]]
print(name)
## [1] "Moe Szyslak" "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" "Simpson, Homer" "Dr. Julius Hibbert"
Q #3a. Use the tools of this chapter to rearrange the vector so that all elements conform to the standard: first_name last_name.
# Flip names written as "last, first": group 1 is everything before the last
# ", " and group 2 is the comma together with whatever follows it, so after the
# swap the comma temporarily sits at the front of the string.
name2 <- str_replace_all(
  string = name,
  pattern = "(.+)(, .+)$",
  replacement = "\\2 \\1"
)
print(name2)
## [1] "Moe Szyslak" ", C. Montgomery Burns" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" ", Homer Simpson" "Dr. Julius Hibbert"
# Drop the comma-and-space separator that the swap moved to the front
name3 <- str_replace_all(string = name2, pattern = ", ", replacement = "")
print(name3)
## [1] "Moe Szyslak" "C. Montgomery Burns" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" "Homer Simpson" "Dr. Julius Hibbert"
# Strip a leading title such as "Rev." or "Dr." (a capital letter, one or two
# lowercase letters and a period). The pattern is anchored to the start of the
# string so abbreviations elsewhere in a name are left alone, and it also
# consumes the whitespace after the title so no leading blank remains.
name4 <- str_replace_all(name3, "^[A-Z][a-z]{1,2}\\.\\s*", "")
name4
## [1] "Moe Szyslak" "C. Montgomery Burns" "Timothy Lovejoy"
## [4] "Ned Flanders" "Homer Simpson" "Julius Hibbert"
Q #3b. Construct a logical vector indicating whether a character has a title (i.e., Rev. and Dr.)
# Flag each title separately. fixed() makes stringr compare the text literally;
# in a plain regular expression the "." would match any character, not just a
# period.
Rev_title <- str_detect(name, fixed("Rev. "))
Dr_title <- str_detect(name, fixed("Dr. "))
# One row per character: the raw name alongside the two title indicators
df <- data.frame(name, Rev_title, Dr_title)
df
## name Rev_title Dr_title
## 1 Moe Szyslak FALSE FALSE
## 2 Burns, C. Montgomery FALSE FALSE
## 3 Rev. Timothy Lovejoy TRUE FALSE
## 4 Ned Flanders FALSE FALSE
## 5 Simpson, Homer FALSE FALSE
## 6 Dr. Julius Hibbert FALSE TRUE
Q #3c. Construct a logical vector indicating whether a character has a second name
# A second name is recognised by its initial: a single capital letter directly
# followed by a period (e.g. "C."). The titles "Rev." and "Dr." have a lowercase
# letter before the period, so they do not match.
df[["SecondName"]] <- str_detect(name3, "[A-Z]{1}\\.")
print(df)
## name Rev_title Dr_title SecondName
## 1 Moe Szyslak FALSE FALSE FALSE
## 2 Burns, C. Montgomery FALSE FALSE TRUE
## 3 Rev. Timothy Lovejoy TRUE FALSE FALSE
## 4 Ned Flanders FALSE FALSE FALSE
## 5 Simpson, Homer FALSE FALSE FALSE
## 6 Dr. Julius Hibbert FALSE TRUE FALSE
Q #4. Describe the types of strings that conform to the following regular expressions and construct an example that is matched by the regular expression
Q #4a. “[0-9]+\$”
# The backslash escapes "$", which would otherwise anchor the end of the string
statement_1 <- "[0-9]+\\$"
test_1 <- c("7777$", "98$", "123231231")
str_detect(test_1, statement_1)
## [1] TRUE TRUE FALSE
# Matches one or more digits immediately followed by a literal dollar sign, e.g. "98$"
Q #4b. “\b[a-z]{1,4}\b”
# \b marks a word boundary on each side of the run of lowercase letters
statement_2 <- "\\b[a-z]{1,4}\\b"
test_2 <- c("data", "science", "is", "cool")
str_detect(string = test_2, pattern = statement_2)
## [1] TRUE FALSE TRUE TRUE
# Matches strings containing a standalone word of one to four lowercase letters
Q #4c. “.*?\.txt$"
# The period is escaped so it is matched literally; "$" anchors the end of the string
statement_3 <- ".*?\\.txt$"
test_3 <- c("file.pdf", "file.txt", "file.dmg", "homework.jpg")
str_detect(string = test_3, pattern = statement_3)
## [1] FALSE TRUE FALSE FALSE
# statement_3 matches any string that ends in ".txt", such as a text file name
Q #4d. “\d{2}/\d{2}/\d{4}”
# Two digits, a slash, two digits, a slash, then four digits
statement_4 <- "\\d{2}/\\d{2}/\\d{4}"
test_4 <- c("11/15/2012", "07/23/2018", "01/01/2006")
str_detect(string = test_4, pattern = statement_4)
## [1] TRUE TRUE TRUE
# Matches digit groups laid out as xx/xx/xxxx, e.g. dates written mm/dd/yyyy
Q #4e. “<(.+?)>.+?</\1>”
# <(.+?)> captures a tag name, .+? requires at least one character of content,
# and </\1> demands a closing tag that repeats the captured name
statement_5 <- "<(.+?)>.+?</\\1>"
test_5 <- c("<hello>hi</hello>", "<hi>hello</hello>")
str_detect(string = test_5, pattern = statement_5)
## [1] TRUE FALSE
# Matches HTML/XML-style elements whose opening and closing tags agree; the second example fails because <hi> is closed by </hello>
Q #9. The following code hides a secret message. Crack it with R and regular expressions. Hint: Some of the characters are more revealing than others! The code snippet is also available in the materials at www.r-datacollection.com
secret_message <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0Tanwo Uwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigO d6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5 fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"
# Keep only the runs of upper-case letters and periods
regex <- "[[:upper:].]+"
revealing_chunks <- str_extract_all(secret_message, regex)[[1]]
# Glue the pieces back together, then turn each run of periods into a space
decoded <- paste(revealing_chunks, collapse = "")
str_replace_all(decoded, pattern = "[\\.]+", replacement = " ")
## [1] "CONGRATULATIONS YOU ARE A SUPERNERD"
#Thanks for the compliment :) Appreciate it.