The Task

Please deliver links to an R Markdown file (in GitHub and rpubs.com) with solutions to problems 3 and 4 from chapter 8 of Automated Data Collection in R. Problem 9 is extra credit.

Load Packages

# Global knitr chunk options for this report: silence warnings and package
# start-up messages, tidy the echoed code, and draw figures as PNG via the
# cairo device type.
knitr::opts_chunk$set(
  dev = "png",
  dev.args = list(type = "cairo"),
  tidy = TRUE,
  message = FALSE,
  warning = FALSE
  # Options left disabled by the author:
  #   echo = FALSE
  #   comment = ""
)

#https://cran.r-project.org/web/packages/prettydoc/vignettes/architect.html
#https://www.rstudio.com/wp-content/uploads/2015/03/rmarkdown-reference.pdf

# Packages this report needs; passed to ipak() to install and attach them.
load.packages <- c(
  "RCurl",
  "knitr",
  "stringr",
  "htmlTable",
  "tesseract",
  "prettydoc"
)


#' Install any missing packages, then attach every requested package.
#'
#' Adapted from https://gist.github.com/stevenworthington/3178163
#'
#' @param pkg Character vector of package names.
#' @return Named logical vector with one element per entry of `pkg`; each
#'   element is the value returned by `require()` for that package. An empty
#'   `pkg` yields an empty logical vector.
ipak <- function(pkg) {
    # Packages that are requested but not present in the installed library.
    new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
    if (length(new.pkg) > 0) {
        install.packages(new.pkg, dependencies = TRUE)
    }
    # vapply() pins the result type to logical; sapply() would fall back to a
    # list when `pkg` is empty.
    vapply(pkg, require, logical(1), character.only = TRUE)
}
# Install (if needed) and attach every package listed in load.packages.
ipak(pkg = load.packages)

## Loading required package: RCurl

## Loading required package: bitops

## Loading required package: knitr

## Loading required package: stringr

## Loading required package: htmlTable

## Loading required package: tesseract

## Loading required package: prettydoc

# html_document: toc: true code_folding: hide

Just for fun, I used the optical character recognition package (tesseract) to get the text off of the textbook image. I wanted to see if it could be done. It worked for more than half of the text.

# text <-
# ocr('https://raw.githubusercontent.com/kylegilde/D607-Data-Acquistion/master/week3_problems.jpg')
# my.list <- unlist(strsplit(text, '\n'))

Problem 3

a. Use the tools of this chapter to rearrange the vector so that all elements conform to the standard first_name last_name.

# sub('fopy','Copy', my.list[1]) cat(my.list[5], sub('Jv','_n', my.list[6]))

# Source text: names and phone numbers run together in a single string.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"

# A name is any run of two or more letters, periods, commas or spaces.
names <- unlist(str_extract_all(string = raw.data, pattern = "[[:alpha:]., ]{2,}"))
names_df <- data.frame(names = names, stringsAsFactors = FALSE)

# For entries containing ", ": the text from the comma onward, with the comma
# removed and whitespace trimmed (NA when the entry has no ", ").
names_df[["first_half"]] <- str_trim(
    str_replace(
        str_extract(names_df[["names"]], "(, ).+"),
        ",", ""
    )
)

# For the same entries: the text up to and including the comma, cleaned the
# same way (NA when the entry has no ", ").
names_df[["last_half"]] <- str_trim(
    str_replace(
        str_extract(names_df[["names"]], ".+(, )"),
        ",", ""
    )
)

# Swap the two halves where a comma was found; otherwise keep the name as is.
names_df[["first_last_name"]] <- ifelse(
    !is.na(names_df[["first_half"]]),
    str_c(names_df[["first_half"]], names_df[["last_half"]], sep = " "),
    names_df[["names"]]
)
kable(names_df[, c("names", "first_last_name")])

names	first_last_name
Moe Szyslak	Moe Szyslak
Burns, C. Montgomery	C. Montgomery Burns
Rev. Timothy Lovejoy	Rev. Timothy Lovejoy
Ned Flanders	Ned Flanders
Simpson, Homer	Homer Simpson
Dr. Julius Hibbert	Dr. Julius Hibbert

b. Construct a logical vector indicating whether a character has a title (i.e., Rev. and Dr.).

# str_replace(my.list[7],'Lew','i.e.')
# A title is two or three letters followed by a period at the very start of
# the rearranged name (e.g. "Rev.", "Dr."). The "^" anchor keeps the pattern
# from matching the tail of a later word that happens to be followed by a
# period (such as a trailing "Jr."); a one-letter initial like "C." still
# does not qualify because at least two letters are required.
regex1 <- "^[[:alpha:]]{2,3}\\."

names_df$has_title <- str_detect(names_df$first_last_name, regex1)
kable(names_df[, c("first_last_name", "has_title")])

first_last_name	has_title
Moe Szyslak	FALSE
C. Montgomery Burns	FALSE
Rev. Timothy Lovejoy	TRUE
Ned Flanders	FALSE
Homer Simpson	FALSE
Dr. Julius Hibbert	TRUE

c. Construct a logical vector indicating whether a character has a second name.

# my.list[8]
# A second name is detected here as a single-letter initial followed by a
# period and whitespace at the start of the rearranged name (e.g. "C. ").
regex2 <- "^[[:alpha:]]\\.\\s"

names_df[["has_2nd_name"]] <- str_detect(
    string = names_df[["first_last_name"]],
    pattern = regex2
)
kable(names_df[, c("first_last_name", "has_2nd_name")])

first_last_name	has_2nd_name
Moe Szyslak	FALSE
C. Montgomery Burns	TRUE
Rev. Timothy Lovejoy	FALSE
Ned Flanders	FALSE
Homer Simpson	FALSE
Dr. Julius Hibbert	FALSE

Problem 4

Describe the types of strings that conform to the following regular expressions and construct an example that is matched by the regular expression.

a. [0-9]+\$

numeric characters one or more times, followed by a dollar sign

# my.list[9:15]
# Pattern: one or more digits immediately followed by a literal dollar sign.
a <- "[0-9]+\\$"
# Example string: only the digit run directly before the "$" is extracted.
a.ex <- "3425452345adsfas12351$asdfasdf"
str_extract(string = a.ex, pattern = a)

## [1] "12351$"

b. \b[a-z]{1,4}\b

whole words made up of one to four lowercase letters, i.e. a run of 1 to 4 lowercase letters with a word boundary on each side

# Pattern: one to four lowercase letters with a word boundary on each side.
b <- "\\b[a-z]{1,4}\\b"
# Example string mixing short words, long words, digits and a letter-digit mix.
b.ex <- " adsfas 54 cs asdff dae4 asds"
str_extract_all(string = b.ex, pattern = b)

## [[1]]
## [1] "cs"   "asds"

c. .*?\.txt$
strings that have any character zero or more times and that end with .txt

# Pattern: any characters (lazily matched) followed by ".txt" at the end of
# the string.
# NOTE: this variable shares its name with base::c(); calls such as c(1, 2)
# still resolve to the function, but the name is easy to misread.
c <- ".*?\\.txt$"
# Example string: arbitrary characters ending in ".txt".
c.ex <- "34w54%&$YTAas54.txt"
str_extract_all(string = c.ex, pattern = c)

## [[1]]
## [1] "34w54%&$YTAas54.txt"

d. \d{2}/\d{2}/\d{4}

strings that have exactly 2 digits, followed by a forward slash, then exactly 2 digits and a forward slash, followed by exactly 4 digits

# Pattern: two digits, a slash, two digits, a slash, then four digits.
d <- "\\d{2}/\\d{2}/\\d{4}"
# Example string: the single-digit and dash-separated forms do not match; the
# slash-separated run embedded in the middle does.
d.ex <- "1/1/2011 12-05-2025sas22/43/520033551"
str_extract_all(string = d.ex, pattern = d)

## [[1]]
## [1] "22/43/5200"

e. <(.+?)>.+?</\1>

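an HTML-style element without attributes: an opening tag, its content (at least one character), and the matching closing tag; the backreference \1 forces the closing tag's name to equal the opening tag's name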

# Pattern: "<name>", at least one character of content, then "</name>", where
# the backreference \1 repeats whatever was captured inside the opening tag.
e <- "<(.+?)>.+?</\\1>"
# Example string: a <title> element surrounded by unrelated text.
e.ex <- "wg45qa4d<title> 12-05-2025sas22/43/520033551</title>dfasdfadsf"
str_extract_all(string = e.ex, pattern = e)

## [[1]]
## [1] "<title> 12-05-2025sas22/43/520033551</title>"

Problem 9

The following code hides a secret message. Crack it with R and regular expressions. Hint: Some of the characters are more revealing than others.

# Download the scrambled text that hides the message.
jibberish <- getURL("http://www.r-datacollection.com/materials/regex/code_exercise.txt")
# The message is carried by the upper-case letters; periods mark the gaps
# between words and "!" is the closing punctuation, so keep those too.
answer <- unlist(str_extract_all(jibberish, "[A-Z.!]"))
jibberish

## [1] "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr\n"

# Join the extracted characters into one string before printing: cat() puts a
# space between vector elements, which would print the message letter-spaced.
# Then turn each period into a space.
cat(str_replace_all(str_c(answer, collapse = ""), "\\.", " "))

## CONGRATULATIONS YOU ARE A SUPERNERD!

D607 Week 3 - Regex Problems

Kyle Gilde

Feb. 12, 2017

The Task

Please deliver links to an R Markdown file (in GitHub and rpubs.com) with solutions to problems 3 and 4 from chapter 8 of Automated Data Collection in R. Problem 9 is extra credit.

Load Packages

Just for fun, I used the optical character recognition package (tesseract) to get the text off of the textbook image. I wanted to see if it could be done. It worked for more than half of the text.

Problem 3

a. Use the tools of this chapter to rearrange the vector so that all elements conform to the standard first_name last_name.

b. Construct a logical vector indicating whether a character has a title (i.e., Rev. and Dr.).

c. Construct a logical vector indicating whether a character has a second name.

Problem 4

Describe the types of strings that conform to the following regular expressions and construct an example that is matched by the regular expression.

a. [0-9]+\$

b. \b[a-z]{1,4}\b

c. .*?\.txt$
strings that have any character zero or more times and that end with .txt

d. \d{2}/\d{2}/\d{4}

e. <(.+?)>.+?</\1>

Problem 9

The following code hides a secret message. Crack it with R and regular expressions. Hint: Some of the characters are more revealing than others.

D607 Week 3 - Regex Problems

Kyle Gilde

Feb. 12, 2017

The Task

Please deliver links to an R Markdown file (in GitHub and rpubs.com) with solutions to problems 3 and 4 from chapter 8 of Automated Data Collection in R. Problem 9 is extra credit.

Load Packages

Just for fun, I used the optical character recognition package (tesseract) to get the text off of the textbook image. I wanted to see if it could be done. It worked for more than half of the text.

Problem 3

a. Use the tools of this chapter to rearrange the vector so that all elements conform to the standard first_name last_name.

b. Construct a logical vector indicating whether a character has a title (i.e., Rev. and Dr.).

c. Construct a logical vector indicating whether a character has a second name.

Problem 4

Describe the types of strings that conform to the following regular expressions and construct an example that is matched by the regular expression.

a. [0-9]+\$

b. \b[a-z]{1,4}\b

c. .*?\.txt$ strings that have any character zero or more times and that end with .txt

d. \d{2}/\d{2}/\d{4}

e. <(.+?)>.+?</\1>

Problem 9

The following code hides a secret message. Crack it with R and regular expressions. Hint: Some of the characters are more revealing than others.

c. .*?\.txt$
strings that have any character zero or more times and that end with .txt