The Task

Load Packages

# Global chunk options for the knitted document: suppress warnings and
# messages, tidy the echoed code, and draw figures with the Cairo PNG device.
knitr::opts_chunk$set(
    # echo = FALSE,   # left disabled
    warning = FALSE,
    message = FALSE,
    tidy = TRUE,
    # comment = "",   # left disabled
    dev = "png",
    dev.args = list(type = "cairo")
)

#https://cran.r-project.org/web/packages/prettydoc/vignettes/architect.html
#https://www.rstudio.com/wp-content/uploads/2015/03/rmarkdown-reference.pdf

# Packages this document needs.
load.packages <- c(
    "RCurl", "knitr", "stringr",
    "htmlTable", "tesseract", "prettydoc"
)


# Install any packages in `pkg` that are not yet installed, then attach all
# of them.
#
# Args:
#   pkg: character vector of package names.
#
# Returns:
#   A named logical vector with one element per entry of `pkg`; each element
#   is the value returned by require() for that package (TRUE when it could
#   be attached). An empty `pkg` yields an empty logical vector.
#
# FUNCTION SOURCE: https://gist.github.com/stevenworthington/3178163
ipak <- function(pkg){
    new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
    if (length(new.pkg) > 0) {
        install.packages(new.pkg, dependencies = TRUE)
    }
    # vapply() instead of sapply(): the result stays a logical vector even
    # when `pkg` is empty, where sapply() would hand back an empty list.
    vapply(pkg, require, logical(1), character.only = TRUE)
}
# Install whatever is missing from load.packages, then attach every package.
ipak(load.packages)
## Loading required package: RCurl
## Loading required package: bitops
## Loading required package: knitr
## Loading required package: stringr
## Loading required package: htmlTable
## Loading required package: tesseract
## Loading required package: prettydoc
# html_document: toc: true code_folding: hide

Just for fun, I used the optical character recognition package (tesseract) to get the text off of the textbook image. I wanted to see if it could be done. It worked for more than half of the text.

# text <-
# ocr('https://raw.githubusercontent.com/kylegilde/D607-Data-Acquistion/master/week3_problems.jpg')
# my.list <- unlist(strsplit(text, '\n'))

Problem 3

a. Use the tools of this chapter to rearrange the vector so that all elements conform to the standard first_name last_name.

# sub('fopy','Copy', my.list[1]) cat(my.list[5], sub('Jv','_n', my.list[6]))

# Raw input: names and phone numbers run together in a single string.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"

# A name is a run of two or more letters, periods, commas or spaces.
names <- unlist(str_extract_all(raw.data, "[[:alpha:]., ]{2,}"))
names_df <- data.frame(names, stringsAsFactors = FALSE)

# For "Last, First" entries: the part after the comma ...
names_df$first_half <- names_df$names %>%
    str_extract("(, ).+") %>%
    str_replace(",", "") %>%
    str_trim()
# ... and the part before it. Both are NA when the name has no comma.
names_df$last_half <- names_df$names %>%
    str_extract(".+(, )") %>%
    str_replace(",", "") %>%
    str_trim()

# Names without a comma are kept as they are; the others are reassembled as
# "<after comma> <before comma>".
names_df$first_last_name <- ifelse(
    is.na(names_df$first_half),
    names_df$names,
    str_c(names_df$first_half, " ", names_df$last_half)
)
kable(names_df[, c("names", "first_last_name")])
names first_last_name
Moe Szyslak Moe Szyslak
Burns, C. Montgomery C. Montgomery Burns
Rev. Timothy Lovejoy Rev. Timothy Lovejoy
Ned Flanders Ned Flanders
Simpson, Homer Homer Simpson
Dr. Julius Hibbert Dr. Julius Hibbert

b. Construct a logical vector indicating whether a character has a title (i.e., Rev. and Dr.).

# str_replace(my.list[7],'Lew','i.e.')
# A title is treated as a 2-3 letter abbreviation ending in a period
# (e.g. "Dr.", "Rev."). The leading \\b requires the abbreviation to start at
# a word boundary, so the last letters of a longer word that happens to be
# followed by a period (e.g. "Simpson.") are not mistaken for a title.
regex1 <- "\\b[[:alpha:]]{2,3}\\."

names_df$has_title <- str_detect(names_df$first_last_name, regex1)
kable(names_df[, c(4, 5)])
first_last_name has_title
Moe Szyslak FALSE
C. Montgomery Burns FALSE
Rev. Timothy Lovejoy TRUE
Ned Flanders FALSE
Homer Simpson FALSE
Dr. Julius Hibbert TRUE

c. Construct a logical vector indicating whether a character has a second name.

# my.list[8]
# Pattern: a single letter at the very start of the string, followed by a
# period and a whitespace character (e.g. the "C. " in "C. Montgomery Burns").
regex2 <- "^[[:alpha:]]\\.\\s"

names_df[["has_2nd_name"]] <- str_detect(
    string = names_df[["first_last_name"]],
    pattern = regex2
)
kable(names_df[, c("first_last_name", "has_2nd_name")])
first_last_name has_2nd_name
Moe Szyslak FALSE
C. Montgomery Burns TRUE
Rev. Timothy Lovejoy FALSE
Ned Flanders FALSE
Homer Simpson FALSE
Dr. Julius Hibbert FALSE

Problem 4

Describe the types of strings that conform to the following regular expressions and construct an example that is matched by the regular expression.

a. [0-9]+\$

numeric characters one or more times, followed by a dollar sign

# my.list[9:15]
# Pattern: one or more digits immediately followed by a literal dollar sign.
a <- "[0-9]+\\$"
a.ex <- "3425452345adsfas12351$asdfasdf"
str_extract(string = a.ex, pattern = a)
## [1] "12351$"

b. \b[a-z]{1,4}\b

whole words made up of 1 to 4 lowercase letters, i.e. a run of 1-4 lowercase letters with a word boundary on each side

# Pattern: 1 to 4 lowercase letters with a word boundary on both sides.
b <- "\\b[a-z]{1,4}\\b"
b.ex <- " adsfas 54 cs asdff dae4 asds"
str_extract_all(string = b.ex, pattern = b)
## [[1]]
## [1] "cs"   "asds"

c. .*?\.txt$
strings that end in .txt: zero or more arbitrary characters (matched lazily), followed by a literal ".txt" at the very end of the string

# Pattern: any characters (lazy), then a literal ".txt" at the end of the
# string. Note that the variable is named `c`, like the base function c().
c <- ".*?\\.txt$"
c.ex <- "34w54%&$YTAas54.txt"
str_extract_all(string = c.ex, pattern = c)
## [[1]]
## [1] "34w54%&$YTAas54.txt"

d. \d{2}/\d{2}/\d{4}

strings that have exactly 2 digits, followed by a forward slash, then exactly 2 digits and a forward slash, followed by exactly 4 digits

# Pattern: two digits, a slash, two digits, a slash, four digits.
d <- "\\d{2}/\\d{2}/\\d{4}"
d.ex <- "1/1/2011 12-05-2025sas22/43/520033551"
str_extract_all(string = d.ex, pattern = d)
## [[1]]
## [1] "22/43/5200"

e. <(.+?)>.+?</\1>

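an HTML-style element: a start tag, at least one character of content, and the matching end tag. The backreference \1 forces the end tag to repeat exactly what was captured inside the start tag, so in practice this matches elements whose start tag has no attributes.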

# Pattern: "<tag>", at least one character of content, then "</tag>", where
# the backreference \\1 repeats whatever the first group captured.
e <- "<(.+?)>.+?</\\1>"
e.ex <- "wg45qa4d<title> 12-05-2025sas22/43/520033551</title>dfasdfadsf"
str_extract_all(string = e.ex, pattern = e)
## [[1]]
## [1] "<title> 12-05-2025sas22/43/520033551</title>"

Problem 9

The following code hides a secret message. Crack it with R and regular expressions. Hint: Some of the characters are more revealing than others!

# Download the scrambled text (needs network access).
jibberish <- getURL("http://www.r-datacollection.com/materials/regex/code_exercise.txt")
# The message is spelled out by the upper-case letters; "." separates the
# words and a single "!" closes the message, so both are kept as well.
answer <- unlist(str_extract_all(jibberish, "[A-Z.!]"))
jibberish
## [1] "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr\n"
# Collapse the characters into one string before printing: cat() on the
# character vector would insert a space between every single letter. After
# collapsing, every period has to be replaced, hence str_replace_all().
cat(str_replace_all(str_c(answer, collapse = ""), "\\.", " "))
## CONGRATULATIONS YOU ARE A SUPERNERD!