The Task
Please deliver links to an R Markdown file (in GitHub and rpubs.com) with solutions to problems 3 and 4 from chapter 8 of Automated Data Collection in R. Problem 9 is extra credit.
Load Packages
# Default chunk options for the whole document.
# Disabled on purpose (kept for reference): echo = FALSE, comment = "".
knitr::opts_chunk$set(
  warning = FALSE,
  message = FALSE,
  tidy = TRUE,
  dev = "png",
  dev.args = list(type = "cairo")
)
#https://cran.r-project.org/web/packages/prettydoc/vignettes/architect.html
#https://www.rstudio.com/wp-content/uploads/2015/03/rmarkdown-reference.pdf
# Packages this document depends on; passed to ipak() further down.
load.packages <- c(
  "RCurl", "knitr", "stringr",
  "htmlTable", "tesseract", "prettydoc"
)
#' Install any missing packages, then attach all of them
#'
#' Function adapted from https://gist.github.com/stevenworthington/3178163
#'
#' @param pkg Character vector of package names.
#' @return A logical vector with one element per entry of `pkg`, holding the
#'   value `require()` returned for that package.
ipak <- function(pkg) {
  # Only packages that are not installed yet need to be fetched.
  new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
  if (length(new.pkg) > 0) {
    install.packages(new.pkg, dependencies = TRUE)
  }
  # vapply() instead of sapply(): the result is always a logical vector,
  # even when `pkg` is empty (sapply() returns an empty list in that case).
  vapply(pkg, require, logical(1), character.only = TRUE)
}
# Install anything missing, then attach every package named in load.packages.
ipak(load.packages)
## Loading required package: RCurl
## Loading required package: bitops
## Loading required package: knitr
## Loading required package: stringr
## Loading required package: htmlTable
## Loading required package: tesseract
## Loading required package: prettydoc
# html_document: toc: true code_folding: hide
Just for fun, I used the optical character recognition package (tesseract) to extract the text from the textbook image. I wanted to see if it could be done. It worked for more than half of the text.
# text <-
# ocr('https://raw.githubusercontent.com/kylegilde/D607-Data-Acquistion/master/week3_problems.jpg')
# my.list <- unlist(strsplit(text, '\n'))
Problem 3
a. Use the tools of this chapter to rearrange the vector so that all elements conform to the standard first_name last_name.
# sub('fopy','Copy', my.list[1]) cat(my.list[5], sub('Jv','_n', my.list[6]))
# Raw string from the exercise: names and phone numbers run together.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"

# Every run of two or more letters, periods, commas or spaces is taken as a name.
names <- unlist(str_extract_all(raw.data, "[[:alpha:]., ]{2,}"))
names_df <- data.frame(names, stringsAsFactors = FALSE)

# For "Last, First" entries, keep the text after / before the comma;
# entries without a comma give NA for both pieces.
names_df$first_half <- str_trim(
  str_replace(str_extract(names_df$names, "(, ).+"), ",", "")
)
names_df$last_half <- str_trim(
  str_replace(str_extract(names_df$names, ".+(, )"), ",", "")
)

# Start from the original name and overwrite only the rows that had a comma.
has_comma <- !is.na(names_df$first_half)
names_df$first_last_name <- names_df$names
names_df$first_last_name[has_comma] <- str_c(
  names_df$first_half[has_comma], " ", names_df$last_half[has_comma]
)
kable(names_df[, c(1, 4)])

| names | first_last_name |
|---|---|
| Moe Szyslak | Moe Szyslak |
| Burns, C. Montgomery | C. Montgomery Burns |
| Rev. Timothy Lovejoy | Rev. Timothy Lovejoy |
| Ned Flanders | Ned Flanders |
| Simpson, Homer | Homer Simpson |
| Dr. Julius Hibbert | Dr. Julius Hibbert |
b. Construct a logical vector indicating whether a character has a title (i.e., Rev. and Dr.).
# str_replace(my.list[7],'Lew','i.e.')
# A title is matched as two or three letters followed by a period ("Dr.", "Rev.").
regex1 <- "[[:alpha:]]{2,3}\\."
names_df[["has_title"]] <- str_detect(
  string = names_df[["first_last_name"]],
  pattern = regex1
)
kable(names_df[, c(4, 5)])

| first_last_name | has_title |
|---|---|
| Moe Szyslak | FALSE |
| C. Montgomery Burns | FALSE |
| Rev. Timothy Lovejoy | TRUE |
| Ned Flanders | FALSE |
| Homer Simpson | FALSE |
| Dr. Julius Hibbert | TRUE |
c. Construct a logical vector indicating whether a character has a second name.
# my.list[8]
# A second name is detected as a single-letter initial plus period and
# whitespace at the very start of the string (e.g. "C. Montgomery Burns").
regex2 <- "^[[:alpha:]]\\.\\s"
names_df[["has_2nd_name"]] <- str_detect(
  string = names_df[["first_last_name"]],
  pattern = regex2
)
kable(names_df[, c(4, 6)])

| first_last_name | has_2nd_name |
|---|---|
| Moe Szyslak | FALSE |
| C. Montgomery Burns | TRUE |
| Rev. Timothy Lovejoy | FALSE |
| Ned Flanders | FALSE |
| Homer Simpson | FALSE |
| Dr. Julius Hibbert | FALSE |
Problem 4
Describe the types of strings that conform to the following regular expressions and construct an example that is matched by the regular expression.
a. [0-9]+\$
numeric characters one or more times, followed by a dollar sign
# my.list[9:15]
# Pattern: one or more digits immediately followed by a literal "$".
a <- "[0-9]+\\$"
# Example with one digit run that is followed by "$" and one that is not.
a.ex <- "3425452345adsfas12351$asdfasdf"
str_extract(string = a.ex, pattern = a)
## [1] "12351$"
b. \b[a-z]{1,4}\b
words made up of one to four lowercase letters, with a word boundary on each side
# Pattern: 1-4 lowercase letters with a word boundary on both sides.
b <- "\\b[a-z]{1,4}\\b"
# Example mixing short words, long words, digits and a letter/digit token.
b.ex <- " adsfas 54 cs asdff dae4 asds"
str_extract_all(string = b.ex, pattern = b)
## [[1]]
## [1] "cs"   "asds"
c. .*?\.txt$
strings that have any character zero or more times and that end with .txt
# Pattern: any characters (lazily), ending in a literal ".txt" at end of string.
# NOTE(review): the variable name `c` masks base::c as a variable only;
# calls such as c(1, 2) still resolve to the function.
c <- ".*?\\.txt$"
# Example: an arbitrary character run that ends in ".txt".
c.ex <- "34w54%&$YTAas54.txt"
str_extract_all(string = c.ex, pattern = c)
## [[1]]
## [1] "34w54%&$YTAas54.txt"
d. \d{2}/\d{2}/\d{4}
strings that have exactly 2 digits, followed by a forward slash, then exactly 2 digits and a forward slash, followed by exactly 4 digits
# Pattern: two digits, "/", two digits, "/", four digits.
d <- "\\d{2}/\\d{2}/\\d{4}"
# Example with near misses (single digits, dashes) and one real match.
d.ex <- "1/1/2011 12-05-2025sas22/43/520033551"
str_extract_all(string = d.ex, pattern = d)
## [[1]]
## [1] "22/43/5200"
e. <(.+?)>.+?</\1>
an HTML start tag without attributes, followed by its content and the matching end tag
# Pattern: "<name>", some content, then the closing "</name>"; the
# backreference \1 forces the closing tag to repeat the captured opening text.
e <- "<(.+?)>.+?</\\1>"
# Example: a <title> element embedded in surrounding noise.
e.ex <- "wg45qa4d<title> 12-05-2025sas22/43/520033551</title>dfasdfadsf"
str_extract_all(string = e.ex, pattern = e)
## [[1]]
## [1] "<title> 12-05-2025sas22/43/520033551</title>"
Problem 9
The following code hides a secret message. Crack it with R and regular expressions. Hint: Some of the characters are more revealing than others!
# Download the scrambled text that hides the message.
jibberish <- getURL("http://www.r-datacollection.com/materials/regex/code_exercise.txt")
# The message is spelled by the upper-case letters; "." separates the words
# and "!" ends the sentence, so those two punctuation marks are kept as well.
answer <- unlist(str_extract_all(jibberish, "[A-Z.!]"))
jibberish
## [1] "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr\n"
# Print the characters back-to-back (sep = "") so the words read normally;
# cat()'s default separator would put a space between every single letter.
cat(str_replace(answer, "\\.", " "), "\n", sep = "")
## CONGRATULATIONS YOU ARE A SUPERNERD!