Libraries:
library(stringr)
library(tidyverse)
library(kableExtra)
library(knitr)
library(plyr)
3.) Copy the introductory example. The vector name stores the extracted names.
- Use the tools of this chapter to rearrange the vector so that all elements conform to the standard first_name last_name.
- Construct a logical vector indicating whether a character has a title (i.e., Rev. and Dr.).
- Construct a logical vector indicating whether a character has a second name.
# Unstructured phone-book text: names and phone numbers run together in one string.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"
(a) Use the tools of this chapter to rearrange the vector so that all elements conform to the standard first_name last_name.
# Pull out the names: every run of two or more letters, periods, commas or
# spaces in raw.data.
original_names <- unlist(str_extract_all(raw.data, "[[:alpha:]., ]{2,}"))
# Drop a middle initial: whitespace, one letter, a period and a space collapse
# to a single space. [A-Za-z] is used because the range [A-z] also matches the
# punctuation characters that sit between "Z" and "a" in ASCII.
names1 <- str_replace(original_names, "\\s[A-Za-z]\\. ", " ")
# Turn "last, first" into "first last" by swapping the two captured words.
names2 <- str_replace(names1, "(\\w+),\\s(\\w+)", "\\2 \\1")
# Strip a title (two or three letters, a period and a space). Replacing with
# the empty string avoids leaving a stray leading space on the name.
new_names <- str_replace(names2, "[A-Za-z]{2,3}\\. ", "")
# Show the original and the cleaned names side by side.
originaln <- data.frame(original_names)
newn <- data.frame(new_names)
kable(list(originaln, newn), "html", escape = FALSE) %>%
  kable_styling("striped", full_width = TRUE) %>%
  column_spec(1, bold = TRUE) %>%
  row_spec(1, bold = TRUE)
|
|
(b) Construct a logical vector indicating whether a character has a title (i.e., Rev. and Dr.).
# TRUE where a name carries a title: two or three letters followed by a period
# and a space (e.g. "Rev. " or "Dr. "). [A-Za-z] is used instead of [A-z],
# whose range also covers the punctuation between "Z" and "a" in ASCII.
title <- str_detect(names2, "[A-Za-z]{2,3}\\. ")
df1 <- data.frame(names2, title)
kable(df1, "html", escape = FALSE) %>%
  kable_styling("striped", full_width = TRUE)
names2 | title |
---|---|
Moe Szyslak | FALSE |
Montgomery Burns | FALSE |
Rev. Timothy Lovejoy | TRUE |
Ned Flanders | FALSE |
Homer Simpson | FALSE |
Dr. Julius Hibbert | TRUE |
(c) Construct a logical vector indicating whether a character has a second name.
# TRUE where a name contains a second name given as an initial: a single
# capital letter followed by a period. The leading word boundary keeps the
# last letter of an all-caps abbreviation (e.g. "DR.") from being mistaken for
# an initial.
second_name <- str_detect(original_names, "\\b[A-Z]\\.")
df2 <- data.frame(original_names, second_name)
kable(df2, "html", escape = FALSE) %>%
  kable_styling("striped", full_width = TRUE)
original_names | second_name |
---|---|
Moe Szyslak | FALSE |
Burns, C. Montgomery | TRUE |
Rev. Timothy Lovejoy | FALSE |
Ned Flanders | FALSE |
Simpson, Homer | FALSE |
Dr. Julius Hibbert | FALSE |
4.) Describe the types of strings that conform to the following regular expressions and construct an example that is matched by the regular expression.
- [0-9]+\$
- \b[a-z]{1,4}\b
- .*?\.txt$
- \d{2}/\d{2}/\d{4}
- <(.+?)>.+?</\1>
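(a) [0-9]+\$ matches one or more digits ([0-9]+) immediately followed by a literal dollar sign; the backslash escapes $, so here it is not the end-of-string anchor.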
# Sample text with two digit runs that are each followed by a dollar sign.
reg_a <- "534435457$saf!@#3123$"
reg_a %>% str_extract_all("[0-9]+\\$")
## [[1]]
## [1] "534435457$" "3123$"
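(b) \b[a-z]{1,4}\b matches a whole lowercase word of one to four letters: \b is a word boundary, [a-z] is any lowercase letter a-z, and {1,4} gives the minimum and maximum number of characters.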
# Sample text mixing short lowercase words, longer words and an uppercase word.
reg_b <- "nick hale y amanda nic khaleyamanda HI"
str_extract_all(string = reg_b, pattern = "\\b[a-z]{1,4}\\b")
## [[1]]
## [1] "nick" "hale" "y" "nic"
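(c) .*?\.txt$ matches strings that end in ".txt", such as file names: . is any character except a line break, *? repeats it zero or more times (lazily), \. is a literal period, and $ anchors the match to the end of the string.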
# Sample file path ending in ".txt".
reg_c <- "c:/local/nic/test.txt"
reg_c %>% str_extract_all(".*?\\.txt$")
## [[1]]
## [1] "c:/local/nic/test.txt"
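(d) \d{2}/\d{2}/\d{4} matches date-like strings of the form dd/dd/dddd: \d is any digit, {2} and {4} require exactly two and four of them, and / is a literal slash. The pattern only checks the shape, not whether the date is valid.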
# Sample text containing one date-like token after a leading space.
rec_d <- " 02/10/2018"
str_extract_all(string = rec_d, pattern = "\\d{2}/\\d{2}/\\d{4}")
## [[1]]
## [1] "02/10/2018"
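(e) <(.+?)>.+?</\1> pulls out HTML elements: <(.+?)> is an opening tag whose contents are captured, .+? is the non-empty text between the tags (matched lazily), and </\1> is a closing tag that must repeat the captured text.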
# Sample HTML paragraph element with matching opening and closing tags.
reg_e <- "<p>this is a paragraph</p>"
reg_e %>% str_extract_all("<(.+?)>.+?</\\1>")
## [[1]]
## [1] "<p>this is a paragraph</p>"
9.) The following code hides a secret message. Crack it with R and regular expressions. Hint: Some of the characters are more revealing than others!
# Encoded text: the code below keeps only its capital letters and periods.
mystery <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"
# Extract every capital letter and period, then glue the pieces into one
# string with nothing between them.
mystery1 <- mystery %>%
  str_extract_all("[[A-Z].]") %>%
  unlist() %>%
  paste(collapse = "")
# Print the message with each period turned into a space.
mystery1 %>% str_replace_all("[.]", " ")
## [1] "CONGRATULATIONS YOU ARE A SUPERNERD"