CUNY MSDS DATA 607 HW 3

library(stringr)
library(kableExtra)
library(knitr)

3.) Copy the introductory example. The vector name stores the extracted names.

Use the tools of this chapter to rearrange the vector so that all elements conform to the standard first_name last_name.
Construct a logical vector indicating whether a character has a title (i.e., Rev. and Dr.).
Construct a logical vector indicating whether a character has a second name.

# Raw text from the introductory example: names interleaved with phone numbers.
# Built from one piece per person; the concatenation is the original string.
raw.data <- paste0(
  "555-1239Moe Szyslak",
  "(636) 555-0113Burns, C. Montgomery",
  "555-6542Rev. Timothy Lovejoy",
  "555 8904Ned Flanders",
  "636-555-3226Simpson, Homer",
  "5553642Dr. Julius Hibbert"
)

Use the tools of this chapter to rearrange the vector so that all elements conform to the standard first_name last_name.

# Pull out every run of two or more letters, periods, commas, or spaces.
# raw.data is a single string, so the first (only) list element holds all hits.
originalNames <- str_extract_all(raw.data, "[[:alpha:]., ]{2,}")[[1]]

originalNames

## [1] "Moe Szyslak"          "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders"         "Simpson, Homer"       "Dr. Julius Hibbert"

# Remove middle initials (a single letter followed by a period and a space),
# e.g. "Burns, C. Montgomery" -> "Burns, Montgomery".
# [A-Za-z] is used instead of [A-z]: the A-z range also covers the ASCII
# punctuation that sits between "Z" and "a" ([ \ ] ^ _ `).
names1 <- str_replace(originalNames, "\\s[A-Za-z]\\. ", " ")
names1

## [1] "Moe Szyslak"          "Burns, Montgomery"    "Rev. Timothy Lovejoy"
## [4] "Ned Flanders"         "Simpson, Homer"       "Dr. Julius Hibbert"

# Turn "Last, First" into "First Last": capture the word before the comma and
# the word after it, then emit them in swapped order.
names2 <- str_replace(
  string = names1,
  pattern = "(\\w+),\\s(\\w+)",
  replacement = "\\2 \\1"
)
names2

## [1] "Moe Szyslak"          "Montgomery Burns"     "Rev. Timothy Lovejoy"
## [4] "Ned Flanders"         "Homer Simpson"        "Dr. Julius Hibbert"

# Remove the title (2-3 letters, a period and a space, e.g. "Rev. ", "Dr. ").
# The match is replaced with an empty string; replacing it with " " would
# leave a stray leading space (" Timothy Lovejoy").
# [A-Za-z] is used instead of [A-z], whose range also covers the ASCII
# punctuation between "Z" and "a".
newNames <- str_replace(names2, "[A-Za-z]{2,3}\\. ", "")

newNames

## [1] "Moe Szyslak"      "Montgomery Burns" "Timothy Lovejoy"
## [4] "Ned Flanders"     "Homer Simpson"    "Julius Hibbert"

# Wrap each vector in its own one-column data frame so the original and the
# cleaned names can be listed next to each other.
originalN <- data.frame(originalNames = originalNames)
newN <- data.frame(newNames = newNames)

# Render both data frames with kable under a single caption.
kable(x = list(originalN, newN), caption = "Names")

Names

originalNames
Moe Szyslak
Burns, C. Montgomery
Rev. Timothy Lovejoy
Ned Flanders
Simpson, Homer
Dr. Julius Hibbert

newNames
Moe Szyslak
Montgomery Burns
Timothy Lovejoy
Ned Flanders
Homer Simpson
Julius Hibbert

Construct a logical vector indicating whether a character has a title (i.e., Rev. and Dr.).

# TRUE where the name contains a title: 2-3 letters, a period and a space
# (e.g. "Rev. ", "Dr. ").
# [A-Za-z] is used instead of [A-z], whose range also covers the ASCII
# punctuation between "Z" and "a" ([ \ ] ^ _ `).
title <- str_detect(names2, "[A-Za-z]{2,3}\\. ")
df1 <- data.frame(names2, title)

df1

##                 names2 title
## 1          Moe Szyslak FALSE
## 2     Montgomery Burns FALSE
## 3 Rev. Timothy Lovejoy  TRUE
## 4         Ned Flanders FALSE
## 5        Homer Simpson FALSE
## 6   Dr. Julius Hibbert  TRUE

Construct a logical vector indicating whether a character has a second name.

# TRUE where the name contains a second name, written as an initial: one
# capital letter directly followed by a period (e.g. "C."). Titles such as
# "Rev." and "Dr." end in a lowercase letter before the period, so they are
# not matched.
# Stored in its own variable so that the title vector is not overwritten and
# the data frame column is labelled for what it actually holds.
second_name <- str_detect(originalNames, "[A-Z]\\.")
df2 <- data.frame(originalNames, second_name)

df2

##          originalNames second_name
## 1          Moe Szyslak       FALSE
## 2 Burns, C. Montgomery        TRUE
## 3 Rev. Timothy Lovejoy       FALSE
## 4         Ned Flanders       FALSE
## 5       Simpson, Homer       FALSE
## 6   Dr. Julius Hibbert       FALSE

4.) Describe the types of strings that conform to the following regular expressions and construct an example that is matched by the regular expression.

a. [0-9]+\\$   b. \\b[a-z]{1,4}\\b   c. .*?\\.txt$   d. \\d{2}/\\d{2}/\\d{4}   e. <(.+?)>.+?</\\1>

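[0-9]+\\$ : [0-9] matches any digit, + means one or more times, and \\$ is a literal dollar sign. It matches a run of digits immediately followed by "$".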

# Example for (a): digit runs that are immediately followed by a literal "$".
ex_one <- "15689142$now!@23$"
str_extract_all(string = ex_one, pattern = "[0-9]+\\$")

## [[1]]
## [1] "15689142$" "23$"

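\\b[a-z]{1,4}\\b : \\b is a word boundary, [a-z] is any lowercase letter, and {1,4} means between one and four of them. It matches whole words made of one to four lowercase letters.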

# Example for (b): whole words of one to four lowercase letters.
ex_two <- " Today is a day great"
str_extract_all(string = ex_two, pattern = "\\b[a-z]{1,4}\\b")

## [[1]]
## [1] "is"  "a"   "day"

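.*?\\.txt$ : . is any character except a line break, *? means zero or more times (lazily), \\. is a literal period, and $ is the end of the string. It matches strings that end in ".txt", such as text file names.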

# Example for (c): a string that ends in ".txt".
ex_three <- "c:/local/amanda/homework.txt"
str_extract_all(string = ex_three, pattern = ".*?\\.txt$")

## [[1]]
## [1] "c:/local/amanda/homework.txt"

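\\d{2}/\\d{2}/\\d{4} : \\d is any digit, {2} and {4} mean exactly two and exactly four of them, and / is a literal "/". It matches dates written as two digits, two digits and four digits separated by slashes.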

# Example for (d): a date written as two digits / two digits / four digits.
ex_four <- "07/02/1983"

str_extract_all(string = ex_four, pattern = "\\d{2}/\\d{2}/\\d{4}")

## [[1]]
## [1] "07/02/1983"

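<(.+?)>.+?</\\1> : <(.+?)> captures a tag name, .+? is the content (one or more characters, lazily), and </\\1> is a closing tag that repeats the captured name through a backreference. It matches HTML-style elements with matching opening and closing tags: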

# Example for (e): an element whose closing tag repeats the opening tag name.
ex_five <- "<p>What kind of line is this, a paragraph</p>"
str_extract_all(string = ex_five, pattern = "<(.+?)>.+?</\\1>")

## [[1]]
## [1] "<p>What kind of line is this, a paragraph</p>"

The following code hides a secret message. Crack it with R and regular expressions. Hint: Some of the characters are more revealing than others!

# Scrambled text that hides the secret message.
data <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"
# Keep only the capital letters and the periods, one character per element.
# data is a single string, so the first (only) list element holds all hits.
mystery <- str_extract_all(data, "[[A-Z].]")[[1]]
mystery

##  [1] "C" "O" "N" "G" "R" "A" "T" "U" "L" "A" "T" "I" "O" "N" "S" "." "Y"
## [18] "O" "U" "." "A" "R" "E" "." "A" "." "S" "U" "P" "E" "R" "N" "E" "R"
## [35] "D"

# Glue the single characters into one string with nothing between them.
mystery <- paste0(mystery, collapse = "")

# The periods mark word breaks, so swap each one for a space.
str_replace_all(string = mystery, pattern = "[.]", replacement = " ")

## [1] "CONGRATULATIONS YOU ARE A SUPERNERD"

CUNY MSDS DATA 607 HW 3

Amanda Arce

September 16, 2018