Data 607_Week 3_Assignment

Load libraries and source data

library(stringr)

## Warning: package 'stringr' was built under R version 3.4.3

library(XML)

## Warning: package 'XML' was built under R version 3.4.3

library(RCurl)

## Warning: package 'RCurl' was built under R version 3.4.3

## Loading required package: bitops

library(tau)

## Warning: package 'tau' was built under R version 3.4.3

# Raw phone-book text: phone numbers and names run together without delimiters.
# Written as one fragment per entry; paste0() joins them with no separator,
# so the result is a single character string.
raw.data <- paste0(
  "555-1239Moe Szyslak",
  "(636) 555-0113Burns, C. Montgomery",
  "555-6542Rev. Timothy Lovejoy",
  "555 8904Ned Flanders",
  "636-555-3226Simpson, Homer",
  "5553642Dr. Julius Hibbert"
)
raw.data

## [1] "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"

3.1. Use the tools of the chapter to rearrange the vector so that all elements conform to the standard first_name last_name

# 3.1: split every name into a first and a last name (titles and middle
# initials are dropped) and report the names in the order in which they
# appear in raw.data.

# extract name data from raw data: runs of two or more letters, periods,
# commas and spaces
name.data <- unlist(str_extract_all(raw.data, "[[:alpha:]., ]{2,}"))

# subset strings into different cases based on word order; the *.idx vectors
# remember where each case sits in name.data so the original order can be
# restored after the cases are stacked
# NOTE: the three cases are assumed to be mutually exclusive and to cover
# every name (true for this data set)

# first, those beginning with surnames followed by first names, initials, etc.
sc.idx <- which(str_detect(name.data, "\\,+"))
surname.case <- name.data[sc.idx]
sc <- surname.case
sc.first <- matrix(unlist(str_extract_all(sc, "\\b[:alpha:]+$")))
sc.last <- matrix(unlist(str_extract_all(sc, "^[:alpha:]+[^\\, ]")))
sc.cleaned <- cbind(sc.first, sc.last)

# next, those beginning with titles (which we'll also remove and store for later)
tc.idx <- which(str_detect(name.data, "\\w{2,3}\\."))
title.case <- name.data[tc.idx]
tc <- title.case
tc.first <- matrix(unlist(str_extract_all(tc, "(?<=\\s)[:alpha:]+(?=\\s)")))
tc.last <- matrix(unlist(str_extract_all(tc, "[:alpha:]+$")))
tc.title <- matrix(unlist(str_extract_all(tc, "^\\w{1,3}\\.")))
tc.cleaned <- cbind(tc.first, tc.last)

# next, those in conventional order of firstname lastname
fc.idx <- which(str_detect(name.data, "^\\w+\\s\\w+$"))
firstlast.case <- name.data[fc.idx]
fc <- firstlast.case
# only the column count is fixed; the row count follows from the number of
# matching names instead of being hard-coded
fc.cleaned <- matrix(unlist(str_split(fc, "[ ]")), ncol = 2, byrow = TRUE)

# rbind() stacks the rows case by case; order() over the remembered positions
# puts them back into the order of name.data
name.data.cleaned <- rbind(sc.cleaned, tc.cleaned, fc.cleaned)
original.order <- order(c(sc.idx, tc.idx, fc.idx))
name.data.cleaned <- name.data.cleaned[original.order, , drop = FALSE]
colnames(name.data.cleaned) <- c("first name", "last name")
data.frame(name.data.cleaned)

##   first.name last.name
## 1        Moe   Szyslak
## 2 Montgomery     Burns
## 3    Timothy   Lovejoy
## 4        Ned  Flanders
## 5      Homer   Simpson
## 6     Julius   Hibbert

3.2. Construct a logical vector indicating whether a character has a title (i.e. Rev. and Dr.)

# the titles themselves were already extracted in 3.1 above (tc.title)
colnames(tc.title) <- "Title"
tc.title

##      Title 
## [1,] "Rev."
## [2,] "Dr."

# the logical vector the exercise asks for: one element per entry of
# name.data, TRUE where the entry contains a title (two or three word
# characters followed by a period, the same pattern used to find titles in 3.1)
has.title <- str_detect(name.data, "\\w{2,3}\\.")
has.title

## [1] FALSE FALSE  TRUE FALSE FALSE  TRUE

3.3. Construct a logical vector indicating whether a character has a second name

# a second name appears in this data as a single-letter initial followed by a
# period (e.g. "C."); the leading \\b requires the letter to start a word, so
# the last letter of a title such as "Dr." does not count as an initial
initial.case <- str_subset(name.data, "\\b[:alpha:]{1}\\.")
ic <- initial.case
ic.initial <- matrix(unlist(str_extract_all(ic, "\\b[:alpha:]{1}\\.")))
colnames(ic.initial) <- "Initial"
ic.initial

##      Initial
## [1,] "C."

# the logical vector the exercise asks for: one element per entry of
# name.data, TRUE where the entry contains such an initial
has.second.name <- str_detect(name.data, "\\b[:alpha:]{1}\\.")
has.second.name

## [1] FALSE  TRUE FALSE FALSE FALSE FALSE

4. Describe the types of strings that conform to the following regular expressions and construct an example that is matched by the regular expression.

4.1. [0-9]+\$

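One or more digits followed by a literal dollar sign (the backslash escapes the `$`, so it is not an end-of-string anchor). The pattern is not anchored, so the match may occur anywhere in the string.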

# positive case: a run of digits immediately followed by a dollar sign
test4.1positive1 <- "06793028$"
str_detect(string = test4.1positive1, pattern = "[0-9]+\\$")

## [1] TRUE

# positive case: a shorter run of digits works as well
test4.1positive2 <- "067$"
str_detect(string = test4.1positive2, pattern = "[0-9]+\\$")

## [1] TRUE

# negative case: digits only, no dollar sign
test4.1negative1 <- "06793028"
str_detect(string = test4.1negative1, pattern = "[0-9]+\\$")

## [1] FALSE

4.2. \b[a-z]{1,4}\b

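A word of one to four lower case letters, delimited by word boundaries on both sides. The match does not have to span the entire string: "ab cd" or "abc!" also match, whereas "abcxyz" does not, because its only word is longer than four letters.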

# positive case: a two-letter lower case word
test4.2positive1 <- "ab"
str_detect(string = test4.2positive1, pattern = "\\b[a-z]{1,4}\\b")

## [1] TRUE

# positive case: a four-letter lower case word (the upper limit)
test4.2positive2 <- "acbz"
str_detect(string = test4.2positive2, pattern = "\\b[a-z]{1,4}\\b")

## [1] TRUE

# negative case: an upper case letter is outside [a-z]
test4.2negative1 <- "A"
str_detect(string = test4.2negative1, pattern = "\\b[a-z]{1,4}\\b")

## [1] FALSE

# negative case: a six-letter word exceeds the {1,4} limit
test4.2negative2 <- "abcxyz"
str_detect(string = test4.2negative2, pattern = "\\b[a-z]{1,4}\\b")

## [1] FALSE

4.3. .*?\.txt$

Any characters (not only lower case letters), zero or more times, followed by the literal “.txt” at the end of the string. (Like a .txt file extension, optionally preceded by a file name of any length; the match is case-sensitive, so “.TXT” does not match.)

# NOTE: the variables below reuse the test4.2 prefix although they exercise
# the pattern of section 4.3

# positive case: a file name ending in .txt
test4.2positive1 <- "applesauce.txt"
str_detect(string = test4.2positive1, pattern = ".*?\\.txt$")

## [1] TRUE

# positive case: the bare extension, nothing in front of it
test4.2positive2 <- ".txt"
str_detect(string = test4.2positive2, pattern = ".*?\\.txt$")

## [1] TRUE

# negative case: .txt is present but not at the end of the string
test4.2negative1 <- "applesauce.txt.rmd"
str_detect(string = test4.2negative1, pattern = ".*?\\.txt$")

## [1] FALSE

# negative case: upper case extension
test4.2negative2 <- ".TXT"
str_detect(string = test4.2negative2, pattern = ".*?\\.txt$")

## [1] FALSE

4.4. \d{2}/\d{2}/\d{4}

Two digits followed by a forward slash, then two more digits followed by another forward slash, then four digits. (Like mm/dd/yyyy, but without maxima for months and days)

# NOTE: the variables below reuse the test4.2 prefix although they exercise
# the pattern of section 4.4

# positive case: a real date written as two digits / two digits / four digits
test4.2positive1 <- "12/07/1941"
str_detect(string = test4.2positive1, pattern = "\\d{2}/\\d{2}/\\d{4}")

## [1] TRUE

# positive case: not a valid date, but the digit counts fit the pattern
test4.2positive2 <- "20/50/0000"
str_detect(string = test4.2positive2, pattern = "\\d{2}/\\d{2}/\\d{4}")

## [1] TRUE

# negative case with backslashes as separators; each backslash has to be
# written as "\\" because R would otherwise read "\31" and "\1" as octal
# escape sequences and store control characters instead of backslashes
test4.2negative1 <- "12\\31\\1999"
# the string contains no forward slashes, so the pattern cannot match
str_detect(string = test4.2negative1, pattern = "\\d{2}/\\d{2}/\\d{4}")

## [1] FALSE

# negative case: one-digit day and two-digit year do not satisfy {2} and {4}
test4.2negative2 <- "12/7/41"
str_detect(string = test4.2negative2, pattern = "\\d{2}/\\d{2}/\\d{4}")

## [1] FALSE

4.5. <(.+?)>.?</\1>

A left angle bracket, one or more characters (matched lazily and captured as group 1), and a right angle bracket; then at most one arbitrary character; then “</”, the exact text captured by group 1 again (the backreference \1), and a closing right angle bracket. In other words: an HTML/XML-style opening tag and its matching closing tag with at most one character between them.

# NOTE: the variables below reuse the test4.2 prefix although they exercise
# the pattern of section 4.5

# positive case: matching tags with a single character between them
test4.2positive1 <- "<hidden agenda>-</hidden agenda>"
str_detect(string = test4.2positive1, pattern = "<(.+?)>.?</\\1>")

## [1] TRUE

# positive case: the tag name is a single space and nothing sits between the tags
test4.2positive2 <- "< ></ >"
str_detect(string = test4.2positive2, pattern = "<(.+?)>.?</\\1>")

## [1] TRUE

# negative case: the closing tag text differs from the opening tag text
test4.2negative1 <- "<hidden agenda></hiddenagenda>"
str_detect(string = test4.2negative1, pattern = "<(.+?)>.?</\\1>")

## [1] FALSE