library(stringr)
library(stringr)
library(XML)
library(maps)
##
## # maps v3.1: updated 'world': all lakes moved to separate new #
## # 'lakes' database. Type '?world' or 'news(package="maps")'. #
- Copy the introductory example. The vector `name` stores the extracted names.
# Raw phone-book text: phone numbers and names run together without separators.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"
# Extract the names: take every run of two or more characters drawn from the
# class (upper/lowercase letters, "+", ".", "," and space). Digits, hyphens
# and parentheses are outside the class, so they end a run.
# raw.data is a single string, so the first list element holds all matches.
name <- str_extract_all(raw.data, "[[A-Z][a-z]+., ]{2,}")[[1]]
name
## [1] "Moe Szyslak" "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" "Simpson, Homer" "Dr. Julius Hibbert"
- Use the tools of this chapter to rearrange the vector so that all elements conform to the standard first_name last_name.
# Cut every entry at its comma: "Last, First" entries come back as two pieces,
# entries without a comma stay as a single piece.
sp_name <- str_split(string = name, pattern = ",")
sp_name
## [[1]]
## [1] "Moe Szyslak"
##
## [[2]]
## [1] "Burns" " C. Montgomery"
##
## [[3]]
## [1] "Rev. Timothy Lovejoy"
##
## [[4]]
## [1] "Ned Flanders"
##
## [[5]]
## [1] "Simpson" " Homer"
##
## [[6]]
## [1] "Dr. Julius Hibbert"
# Move the given name in front of the family name for the comma-split entries.
# Any entry with more than one piece gets its first two pieces swapped; entries
# with a single piece pass through unchanged. The pieces are neither trimmed
# nor re-joined here.
# lapply() replaces an index loop over 1:length(sp_name), which would step
# through 1 and 0 (and fail) if sp_name were empty.
sp_name <- lapply(sp_name, function(parts) {
  if (length(parts) > 1) {
    parts[1:2] <- parts[2:1]
  }
  parts
})
sp_name
## [[1]]
## [1] "Moe Szyslak"
##
## [[2]]
## [1] " C. Montgomery" "Burns"
##
## [[3]]
## [1] "Rev. Timothy Lovejoy"
##
## [[4]]
## [1] "Ned Flanders"
##
## [[5]]
## [1] " Homer" "Simpson"
##
## [[6]]
## [1] "Dr. Julius Hibbert"
- Construct a logical vector indicating whether a character has a title (i.e., Rev. and Dr.).
# Flag the names that carry a title (Rev., Dr., Mr., Ms., Mrs.).
# The period is escaped so that it matches a literal "."; an unescaped "."
# matches any character, which would also flag names such as "Drew".
# \b requires the abbreviation to start at a word boundary.
title_name <- str_detect(name, "\\b(Rev|Dr|Mr|Ms|Mrs)\\.")
# Show each name next to its flag.
booltn <- data.frame(name, title_name)
booltn
## name title_name
## 1 Moe Szyslak FALSE
## 2 Burns, C. Montgomery FALSE
## 3 Rev. Timothy Lovejoy TRUE
## 4 Ned Flanders FALSE
## 5 Simpson, Homer FALSE
## 6 Dr. Julius Hibbert TRUE
- Construct a logical vector indicating whether a character has a second name.
# A second name is detected as an initial: a space, one capital letter and
# then a literal period.
middle_name <- str_detect(string = name, pattern = " [A-Z]\\.")
# Show each name next to its flag.
boolmn <- data.frame(name = name, middle_name = middle_name)
boolmn
## name middle_name
## 1 Moe Szyslak FALSE
## 2 Burns, C. Montgomery TRUE
## 3 Rev. Timothy Lovejoy FALSE
## 4 Ned Flanders FALSE
## 5 Simpson, Homer FALSE
## 6 Dr. Julius Hibbert FALSE
- Consider the string `<title>+++BREAKING NEWS+++</title>`. We would like to extract the first HTML tag. To do so we write the regular expression `<.+>`. Explain why this fails and correct the expression.
# Why "<.+>" fails: "+" is greedy, so ".+" runs on to the last ">" in the
# string and the match covers the whole element instead of only the first tag.
tag_str <- "<title>+++BREAKING NEWS+++</title>"
wrong_tag <- str_extract(string = tag_str, pattern = "<.+>")
wrong_tag
## [1] "<title>+++BREAKING NEWS+++</title>"
# Corrected regex: the lazy quantifier "+?" stops at the first ">", so only the
# opening tag is returned. Unlike a letters-only character class, this also
# handles tags that contain digits or attributes (e.g. <h1>, <a href="...">).
right_tag <- str_extract(tag_str, "<.+?>")
right_tag
## [1] "<title>"
- Consider the string `(5-3)^2=5^2-2*5*3+3^2`, which conforms to the binomial theorem. We would like to extract the formula in the string. To do so we write the regular expression `[^0-9=+*()]+`. Explain why this fails and correct the expression.
# Why "[^0-9=+*()]+" fails: a "^" placed first inside brackets negates the
# class, so the pattern matches runs of characters that are NOT digits, "=",
# "+", "*" or parentheses. The first such run in the formula is the lone "-".
binofo <- "(5-3)^2=5^2-2*5*3+3^2"
fix_form <- str_extract(string = binofo, pattern = "[^0-9=+*()]+")
fix_form
## [1] "-"
# Corrected regex: the caret is escaped so it is a literal "^" instead of a
# negation, and "-" is added to the class so the minus signs match as well.
new_form <- str_extract(string = binofo, pattern = "[\\^0-9-=+*()]+")
new_form
## [1] "(5-3)^2=5^2-2*5*3+3^2"
- The following code hides a secret message. Crack it with R and regular expressions. Hint: Some of the characters are more revealing than others! The code snippet is also available in the materials at www.r-datacollection.com. clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0Tanwo Uwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigO d6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5 fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr
# Scrambled text from the exercise.
tempstr <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"
# Keep only the uppercase letters plus "." and "!", one character per element.
# tempstr is a single string, so the first list element holds all matches.
secret <- str_extract_all(tempstr, "[[:upper:].!]")[[1]]
secret
## [1] "C" "O" "N" "G" "R" "A" "T" "U" "L" "A" "T" "I" "O" "N" "S" "." "Y"
## [18] "O" "U" "." "A" "R" "E" "." "A" "." "S" "U" "P" "E" "R" "N" "E" "R"
## [35] "D" "!"