Question 1: The vector name stores the extracted names.

  1. Use the tools of this chapter to rearrange the vector so that all elements conform to the standard first_name last_name.
library(tidyverse)
## -- Attaching packages ---------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 2.2.1     v purrr   0.2.4
## v tibble  1.4.1     v dplyr   0.7.4
## v tidyr   0.7.2     v stringr 1.2.0
## v readr   1.1.1     v forcats 0.2.0
## -- Conflicts ------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(stringi)
# Raw input: digit sequences and person names run together in one string.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"
raw.data
## [1] "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"
# A name is any run of two or more letters, periods, commas, or spaces;
# digits, dashes, and parentheses therefore act as separators.
raw.data %>%
  str_extract_all("[[:alpha:]., ]{2,}") %>%
  unlist()
## [1] "Moe Szyslak"          "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders"         "Simpson, Homer"       "Dr. Julius Hibbert"
# Extract the name fragments: runs of two or more letters, periods,
# commas, or spaces.
original_names <- unlist(str_extract_all(raw.data, "[[:alpha:]., ]{2,}"))
# Remove middle initials: a single letter followed by a period, with
# whitespace on both sides (e.g. " C. "), collapses to one space.
# [[:alpha:]] is used instead of [A-z], because the A-z range also covers
# the non-letter characters [ \ ] ^ _ ` that sit between "Z" and "a".
names01 <- str_replace(original_names, "\\s[[:alpha:]]\\. ", " ")
# Reorder "last_name, first_name" into "first_name last_name".
names02 <- str_replace(names01, "(\\w+),\\s(\\w+)", "\\2 \\1")
# Remove titles (two or three letters followed by a period and a space).
# The title is replaced by the empty string so that no leading space is
# left in front of the first name.
new_names <- str_replace(names02, "[[:alpha:]]{2,3}\\. ", "")
new_names
## [1] "Moe Szyslak"      "Montgomery Burns" "Timothy Lovejoy"
## [4] "Ned Flanders"     "Homer Simpson"    "Julius Hibbert"
# Show the names as extracted, one per row.
original_names_df <- data.frame(original_names = original_names)
original_names_df
##         original_names
## 1          Moe Szyslak
## 2 Burns, C. Montgomery
## 3 Rev. Timothy Lovejoy
## 4         Ned Flanders
## 5       Simpson, Homer
## 6   Dr. Julius Hibbert
# Show the cleaned-up names in the same tabular form for comparison.
new_names_df <- data.frame(new_names = new_names)
new_names_df
##          new_names
## 1      Moe Szyslak
## 2 Montgomery Burns
## 3  Timothy Lovejoy
## 4     Ned Flanders
## 5    Homer Simpson
## 6   Julius Hibbert
  1. Construct a logical vector indicating whether a character has a title (i.e., Rev. and Dr.)
# TRUE where the name carries a title: two or three letters followed by a
# period and a space (e.g. "Rev. ", "Dr. "). names02 is used because the
# middle initials have already been removed from it but the titles have not.
# [[:alpha:]] replaces [A-z], whose range also matches [ \ ] ^ _ `.
title <- str_detect(names02, "[[:alpha:]]{2,3}\\. ")
title_df <- data.frame(names02, title)
title_df
##                names02 title
## 1          Moe Szyslak FALSE
## 2     Montgomery Burns FALSE
## 3 Rev. Timothy Lovejoy  TRUE
## 4         Ned Flanders FALSE
## 5        Homer Simpson FALSE
## 6   Dr. Julius Hibbert  TRUE
  1. Construct a logical vector indicating whether a character has a second name.
# TRUE where the name contains a middle initial: a single capital letter
# followed by a period (e.g. "C."). The leading \\b requires the capital to
# start a word, so an all-caps title or suffix such as "DR." or "JR." is not
# mistaken for an initial.
second_name <- str_detect(original_names, "\\b[A-Z]\\.")
sec_name_df <- data.frame(original_names, second_name)
sec_name_df
##         original_names second_name
## 1          Moe Szyslak       FALSE
## 2 Burns, C. Montgomery        TRUE
## 3 Rev. Timothy Lovejoy       FALSE
## 4         Ned Flanders       FALSE
## 5       Simpson, Homer       FALSE
## 6   Dr. Julius Hibbert       FALSE

Question 2: Describe the types of strings that conform to the following regular expressions and construct an example that is matched by the regular expression.

  1. [0-9]+\\$
# Matches one or more digits immediately followed by a literal dollar sign
# (the escaped "$" is a character here, not the end-of-string anchor).
reg_a <- "534435457$saf!@#3123$"
str_extract_all(string = reg_a, pattern = "[0-9]+\\$")
## [[1]]
## [1] "534435457$" "3123$"
  1. \\b[a-z]{1,4}\\b
# Matches whole words made of one to four lowercase letters; longer words
# and words containing a capital letter are skipped.
winter_olympics <- "Winter Olympics started last week"
str_extract_all(string = winter_olympics, pattern = "\\b[a-z]{1,4}\\b")
## [[1]]
## [1] "last" "week"
  1. .*?\\.txt$
# Matches strings that end in the literal suffix ".txt": any (possibly
# empty) prefix, then ".txt" anchored to the end of the string.
data_text <- c(
  "Data607.txt",
  "Data.txt!",
  "data-science.pdf"
)
str_extract_all(string = data_text, pattern = ".*?\\.txt$")
## [[1]]
## [1] "Data607.txt"
## 
## [[2]]
## character(0)
## 
## [[3]]
## character(0)
  1. \\d{2}/\\d{2}/\\d{4}
# Matches two digits, a slash, two digits, a slash, and four digits,
# i.e. a numeric date with a four-digit year.
date_test <- c(
  "01/01/18",
  "02/18/2018",
  "Feb 14, 2018"
)
str_extract_all(string = date_test, pattern = "\\d{2}/\\d{2}/\\d{4}")
## [[1]]
## character(0)
## 
## [[2]]
## [1] "02/18/2018"
## 
## [[3]]
## character(0)
  1. <(.+?)>.+?</\\1>
# Matches an opening tag, at least one character of content, and a closing
# tag whose name repeats the opening tag's name (via the \\1 backreference).
tag_test <- c(
  "<tag>this is a tag</tag>",
  "<data>data tag</data>",
  "<not a tag>",
  "<DATA607> Data Acquistion with R</DATA607>"
)
str_extract_all(string = tag_test, pattern = "<(.+?)>.+?</\\1>")
## [[1]]
## [1] "<tag>this is a tag</tag>"
## 
## [[2]]
## [1] "<data>data tag</data>"
## 
## [[3]]
## character(0)
## 
## [[4]]
## [1] "<DATA607> Data Acquistion with R</DATA607>"

Question 3: The following code hides a secret message. Crack it with R and regular expressions. Hint: Some of the characters are more revealing than others! The code snippet is also available in the materials at www.r-datacollection.com.

  1. clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0Tanwo Uwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigO d6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5 fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk! gr
data.hidden <- 'clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0Tanwo Uwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigO d6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5 fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk! gr'

# The message is carried by the uppercase letters and the punctuation marks.
# Punctuation is kept in full (not only ".") so that the closing "!" of the
# message is preserved; lowercase letters, digits, and spaces are noise.
secret_message <- unlist(str_extract_all(data.hidden, "[[:upper:][:punct:]]+"))
secret_message
##  [1] "C"  "O"  "N"  "G"  "R"  "A"  "T"  "U"  "L"  "AT" "I"  "O"  "N"  "S" 
## [15] "."  "Y"  "O"  "U"  "."  "A"  "R"  "E"  "."  "A"  ".S" "U"  "P"  "E" 
## [29] "R"  "N"  "E"  "R"  "D"  "!"
# Join the fragments and turn the periods, which act as word separators,
# into spaces.
decoded.message <- str_replace_all(paste0(secret_message, collapse = ""), "[.]", " ")
decoded.message
## [1] "CONGRATULATIONS YOU ARE A SUPERNERD!"