library(stringr)
library(stringr)
library(XML)
library(maps)
## 
##  # maps v3.1: updated 'world': all lakes moved to separate new #
##  # 'lakes' database. Type '?world' or 'news(package="maps")'.  #
  1. Copy the introductory example. The vector name stores the extracted names.
# Input data: phone numbers and character names run together in one string.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"

# Extract the names. A name is a run of at least two characters drawn from
# letters, periods, commas and spaces. Quantifiers have no meaning inside a
# character class (a "+" there is a literal plus sign), so the class lists
# only the characters that may occur in a name.
name <- unlist(str_extract_all(raw.data, "[[:alpha:]., ]{2,}"))
name
## [1] "Moe Szyslak"          "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders"         "Simpson, Homer"       "Dr. Julius Hibbert"
  1. Use the tools of this chapter to rearrange the vector so that all elements conform to the standard first_name last_name.
# Break every name apart at the comma. Names without a comma come back as a
# single piece; "Last, First" names come back as two pieces.
sp_name <- str_split(string = name, pattern = ",")
print(sp_name)
## [[1]]
## [1] "Moe Szyslak"
## 
## [[2]]
## [1] "Burns"          " C. Montgomery"
## 
## [[3]]
## [1] "Rev. Timothy Lovejoy"
## 
## [[4]]
## [1] "Ned Flanders"
## 
## [[5]]
## [1] "Simpson" " Homer" 
## 
## [[6]]
## [1] "Dr. Julius Hibbert"
# Put the first name in front of the surname for "Last, First" entries by
# swapping the two leading pieces; names that had no comma are left untouched.
# lapply() replaces an index loop over 1:length(sp_name), which would iterate
# over c(1, 0) and fail on an empty list.
sp_name <- lapply(sp_name, function(parts) {
  if (length(parts) > 1) {
    parts[1:2] <- parts[2:1]
  }
  parts
})

# Collapse the reordered pieces into one "first_name last_name" string per
# person, dropping the blank that followed the comma.
std_name <- vapply(
  sp_name,
  function(parts) paste(trimws(parts), collapse = " "),
  character(1)
)

sp_name
## [[1]]
## [1] "Moe Szyslak"
## 
## [[2]]
## [1] " C. Montgomery" "Burns"         
## 
## [[3]]
## [1] "Rev. Timothy Lovejoy"
## 
## [[4]]
## [1] "Ned Flanders"
## 
## [[5]]
## [1] " Homer"  "Simpson"
## 
## [[6]]
## [1] "Dr. Julius Hibbert"
  1. Construct a logical vector indicating whether a character has a title (i.e., Rev. and Dr.).
# TRUE for names that carry a title. The period is escaped so that it matches
# a literal "." rather than any character (otherwise "Dr." would also match
# "Drew"), and \\b requires the title to start at a word boundary.
title_name <- str_detect(name, "\\b(Rev|Dr|Mr|Ms|Mrs)\\.")
# Show each name next to its flag.
booltn <- data.frame(name, title_name)
booltn
##                   name title_name
## 1          Moe Szyslak      FALSE
## 2 Burns, C. Montgomery      FALSE
## 3 Rev. Timothy Lovejoy       TRUE
## 4         Ned Flanders      FALSE
## 5       Simpson, Homer      FALSE
## 6   Dr. Julius Hibbert       TRUE
  1. Construct a logical vector indicating whether a character has a second name.
# A second name is recognised by an abbreviated initial: a blank, one capital
# letter and a period (as in "Burns, C. Montgomery").
middle_name <- str_detect(string = name, pattern = " [A-Z]\\.")
# Tabulate each name with its second-name flag.
boolmn <- data.frame(name = name, middle_name = middle_name)
print(boolmn)
##                   name middle_name
## 1          Moe Szyslak       FALSE
## 2 Burns, C. Montgomery        TRUE
## 3 Rev. Timothy Lovejoy       FALSE
## 4         Ned Flanders       FALSE
## 5       Simpson, Homer       FALSE
## 6   Dr. Julius Hibbert       FALSE
  1. Consider the string <title>+++BREAKING NEWS+++</title>. We would like to extract the first HTML tag. To do so we write the regular expression <.+>. Explain why this fails and correct the expression.
tag_str <- "<title>+++BREAKING NEWS+++</title>"

# "<.+>" fails because "+" is greedy: ".+" consumes as many characters as
# possible, so the match runs from the first "<" to the LAST ">" in the string.
wrong_tag <- str_extract(tag_str, "<.+>")
wrong_tag
## [1] "<title>+++BREAKING NEWS+++</title>"
# Adding "?" makes the quantifier lazy, so the match stops at the first ">".
# Unlike a letters-only class, this also handles tags that contain digits or
# attributes (e.g. <h1> or <a href="...">).
right_tag <- str_extract(tag_str, "<.+?>")
right_tag
## [1] "<title>"
  1. Consider the string "(5-3)^2=5^2-2*5*3+3^2 conforms to the binomial theorem". We would like to extract the formula in the string. To do so we write the regular expression [^0-9=+*()]+. Explain why this fails and correct the expression.
binofo <- "(5-3)^2=5^2-2*5*3+3^2"

# "[^0-9=+*()]+" fails because a "^" placed first inside a character class
# negates it: the pattern matches characters that are NOT digits or one of
# = + * ( ), so the first hit is the lone "-".
fix_form <- str_extract(binofo, "[^0-9=+*()]+")
fix_form
## [1] "-"
# Escaping the caret turns it into a literal "^". The hyphen is escaped as well
# so that it is unambiguously a literal "-" and cannot be read as part of a
# range.
new_form <- str_extract(binofo, "[\\^\\-0-9=+*()]+")
new_form
## [1] "(5-3)^2=5^2-2*5*3+3^2"
  1. The following code hides a secret message. Crack it with R and regular expressions. Hint: Some of the characters are more revealing than others! The code snippet is also available in the materials at www.r-datacollection.com. clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0Tanwo Uwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigO d6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5 fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr
tempstr <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"

# The message is carried by the upper-case letters together with "." and "!".
# Extract them in order of appearance; the input is a single string, so the
# first (and only) list element holds every match.
secret <- str_extract_all(string = tempstr, pattern = "[[:upper:].!]")[[1]]

print(secret)
##  [1] "C" "O" "N" "G" "R" "A" "T" "U" "L" "A" "T" "I" "O" "N" "S" "." "Y"
## [18] "O" "U" "." "A" "R" "E" "." "A" "." "S" "U" "P" "E" "R" "N" "E" "R"
## [35] "D" "!"