Question 03

library(stringr)
# Textbook example string: names interleaved with phone numbers.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"
# A name is any run of at least 10 letters, periods, commas, or spaces.
# The input is a single string, so the first list element holds every match.
name <- str_extract_all(raw.data, "[[:alpha:]., ]{10,}")[[1]]
name

## [1] "Moe Szyslak"          "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders"         "Simpson, Homer"       "Dr. Julius Hibbert"

#The above code is taken from the textbook

Use the tools of this chapter to rearrange the vector so that all elements conform to the standard first_name last_name.

First, fix the two reversed (last name, first name) entries: Burns, C. Montgomery and Simpson, Homer.

# Move everything before the comma (group 1) behind the ", First" part
# (group 2). Group 2 still carries the leading ", ", which is removed in
# the following step.
swap_pattern <- "(.+)(, .+)$"
name <- str_replace_all(
  string = name,
  pattern = swap_pattern,
  replacement = "\\2 \\1"
)
name

## [1] "Moe Szyslak"           ", C. Montgomery Burns" "Rev. Timothy Lovejoy" 
## [4] "Ned Flanders"          ", Homer Simpson"       "Dr. Julius Hibbert"

Delete the commas in the text

# Drop every ", " left over from the reordering step.
name <- str_remove_all(name, ", ")
name

## [1] "Moe Szyslak"          "C. Montgomery Burns"  "Rev. Timothy Lovejoy"
## [4] "Ned Flanders"         "Homer Simpson"        "Dr. Julius Hibbert"

Remove the titles

# Strip a title: a capital letter plus one or two lowercase letters and a
# period (e.g. "Dr.", "Rev."), together with any whitespace that follows it,
# so no leading space is left behind. A bare initial such as "C." has no
# lowercase letter before the period and is kept.
namefinal <- str_replace_all(name, "[A-Z][a-z]{1,2}\\.\\s*", "")
namefinal

## [1] "Moe Szyslak"         "C. Montgomery Burns" "Timothy Lovejoy"    
## [4] "Ned Flanders"        "Homer Simpson"       "Julius Hibbert"

b) Construct a logical vector indicating whether a character has a title (i.e., Rev. and Dr.)

library("knitr")
# Flag names carrying a title: two or more word characters directly followed
# by a period ("Rev.", "Dr."). A lone initial such as "C." has only one
# character before the period and is not flagged.
data <- data.frame(name, title = str_detect(name, "\\w{2,}\\."))
data$title

## [1] FALSE FALSE  TRUE FALSE FALSE  TRUE

# Render the data frame as a table.
kable(data)

name	title
Moe Szyslak	FALSE
C. Montgomery Burns	FALSE
Rev. Timothy Lovejoy	TRUE
Ned Flanders	FALSE
Homer Simpson	FALSE
Dr. Julius Hibbert	TRUE

c) Construct a logical vector indicating whether a character has a second name.

# Flag names with a second name, recognised here by a single capital letter
# followed by a period, e.g. the "C." in "C. Montgomery Burns".
data[["Sname"]] <- str_detect(name, "[A-Z]{1}\\.")
data[["Sname"]]

## [1] FALSE  TRUE FALSE FALSE FALSE FALSE

# Render the updated data frame as a table.
kable(data)

name	title	Sname
Moe Szyslak	FALSE	FALSE
C. Montgomery Burns	FALSE	TRUE
Rev. Timothy Lovejoy	TRUE	FALSE
Ned Flanders	FALSE	FALSE
Homer Simpson	FALSE	FALSE
Dr. Julius Hibbert	TRUE	FALSE

Question 04

[0-9]+\$

# Sample strings for testing the pattern [0-9]+\$.
a <- c(
  "alphanumeric456",
  "396$dhyth",
  "jhah8724627hav$",
  "real453$estate"
)
grepl(pattern = "[0-9]+\\$", x = a)

## [1] FALSE  TRUE FALSE  TRUE

# The pattern matches one or more digits immediately followed by a
# literal "$".
digits_dollar <- str_extract_all(a, pattern = "[0-9]+\\$")
unlist(digits_dollar)

## [1] "396$" "453$"

\b[a-z]{1,4}\b

# Sample words for testing the pattern \b[a-z]{1,4}\b.
b <- c("Samantha", "john", "apple", "banana5", "gre", "gRe")
# Test the pattern exactly as stated, without a trailing "$" anchor, so the
# logical test agrees with the extraction that follows, which also uses the
# unanchored pattern.
short_word_pattern <- "\\b[a-z]{1,4}\\b"
grepl(short_word_pattern, b)

## [1] FALSE  TRUE FALSE FALSE  TRUE FALSE

# The pattern matches whole words of one to four lowercase letters.
short_words <- str_extract_all(b, pattern = "\\b[a-z]{1,4}\\b")
unlist(short_words)

## [1] "john" "gre"

.*?\.txt$

# Sample file names for testing the pattern .*?\.txt$.
c <- c(
  "fox.txt",
  "alpha.txtjghh",
  "alptxt",
  "1234.txt"
)
grepl(pattern = ".*?\\.txt$", x = c)

## [1]  TRUE FALSE FALSE  TRUE

# The pattern matches any string that ends in ".txt".
txt_names <- str_extract_all(c, pattern = ".*?\\.txt$")
unlist(txt_names)

## [1] "fox.txt"  "1234.txt"

\d{2}/\d{2}/\d{4}

# Sample dates for testing the pattern \d{2}/\d{2}/\d{4}.
d <- c(
  "03/06/2019",
  "06/05/19",
  "04/03/2003",
  "4/5/98"
)
grepl(pattern = "\\d{2}/\\d{2}/\\d{4}", x = d)

## [1]  TRUE FALSE  TRUE FALSE

# The pattern matches dates written as two digits, two digits, and four
# digits separated by slashes (xx/xx/xxxx).
full_dates <- str_extract_all(d, pattern = "\\d{2}/\\d{2}/\\d{4}")
unlist(full_dates)

## [1] "03/06/2019" "04/03/2003"

<(.+?)>.+?</\1>

# Sample tag strings for testing the pattern <(.+?)>.+?</\1>.
e <- c(
  "<html>between</html>",
  "<boy>girl</boy>",
  "<boy></boy>",
  "<girl> </girl>"
)
grepl(pattern = "<(.+?)>.+?</\\1>", x = e)

## [1]  TRUE  TRUE FALSE  TRUE

# The pattern matches an opening tag, at least one character of content, and
# a closing tag with the same name. A single space counts as content, as in
# <girl> </girl>.
tagged <- str_extract_all(e, pattern = "<(.+?)>.+?</\\1>")
unlist(tagged)

## [1] "<html>between</html>" "<boy>girl</boy>"      "<girl> </girl>"

Question 09

# Scrambled input: the hidden message is buried among lowercase letters and
# digits.
mystery <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"

It looks like the message is in caps, so I am removing the lowercase letters.

# Strip every lowercase letter; capitals, digits, and punctuation remain.
mystery1 <- gsub("[[:lower:]]", "", mystery)
mystery1

## [1] "C10877ON92G8R5A50TU7L803AT5I307O553N364S.11YO6U2.2A4R905E.A.SU65P17E2463R95896N594E9054R5D!"

Next I remove all the digits.

# Strip every digit, leaving only capitals and punctuation.
mystery2 <- gsub("[[:digit:]]", "", mystery1)
mystery2

## [1] "CONGRATULATIONS.YOU.ARE.A.SUPERNERD!"

Now I have to replace the “.” between the words with spaces

# Turn each literal "." into a space so the words are separated, then print
# the result without surrounding quotes.
mystery_solved <- str_replace_all(
  string = mystery2,
  pattern = "\\.",
  replacement = " "
)
print(mystery_solved, quote = FALSE)

## [1] CONGRATULATIONS YOU ARE A SUPERNERD!

Data607 Assignment 03

Farhana Zahir

Question 03

Question 04

Question 09