Data607 Assignment

library(stringr)

Problem 3 Both

3-1

# Raw text in which phone numbers and character names run together.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"
# A name is any run of two or more letters, commas, periods or spaces.
regex <- "([a-zA-Z,. ]){2,}"
# raw.data is a single string, so the first (only) list element holds all matches.
name <- str_extract_all(raw.data, regex)[[1]]
name

## [1] "Moe Szyslak"          "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders"         "Simpson, Homer"       "Dr. Julius Hibbert"

# Inspect the structure of the extracted names (type and length).
utils::str(name)

##  chr [1:6] "Moe Szyslak" "Burns, C. Montgomery" "Rev. Timothy Lovejoy" ...

# Drop every "word." token plus the whitespace after it; this removes the
# titles (Rev., Dr.) and the middle initial (C.) in one pass.
data.name <- str_replace_all(name, "(\\w+)\\.\\s", "")
data.name

## [1] "Moe Szyslak"       "Burns, Montgomery" "Timothy Lovejoy"  
## [4] "Ned Flanders"      "Simpson, Homer"    "Julius Hibbert"

# Turn "Last, First" into "First Last"; names without a comma are unchanged.
data.name <- gsub("(\\w+)\\,\\s(.*)", "\\2 \\1", data.name)
# Split each name at the first space into two columns and label them.
df <- setNames(
  data.frame(str_split_fixed(data.name, " ", 2)),
  c("First Name", "Last Name")
)
df

##   First Name Last Name
## 1        Moe   Szyslak
## 2 Montgomery     Burns
## 3    Timothy   Lovejoy
## 4        Ned  Flanders
## 5      Homer   Simpson
## 6     Julius   Hibbert

Problem 3-2

Checking whether the names have a title

# TRUE where a name contains a two- or three-letter token followed by a period.
str_detect(string = name, pattern = "[A-Za-z]{2,3}\\.")

## [1] FALSE FALSE  TRUE FALSE FALSE  TRUE

# The 3rd and the last name contain titles; pull out the title text itself.
# str_extract() already returns a character vector (NA where nothing matches).
str_extract(name, "[A-Za-z]{2,3}\\.")

## [1] NA     NA     "Rev." NA     NA     "Dr."

Problem 3-3

Check whether a character has a 2nd name

# Strip the titles first so they are not counted as part of the name.
# The space that followed a title is left in place here.
cleaned_names <- str_replace_all(name, "[A-Za-z]{2,3}\\.", "")
cleaned_names

## [1] "Moe Szyslak"          "Burns, C. Montgomery" " Timothy Lovejoy"    
## [4] "Ned Flanders"         "Simpson, Homer"       " Julius Hibbert"

# Flag names made of more than two words (three or more), i.e. names that
# carry a second name or initial. Leading whitespace is trimmed before counting.
str_count(str_trim(cleaned_names), "\\w+") >= 3L

## [1] FALSE  TRUE FALSE FALSE FALSE FALSE

## Only C. Montgomery Burns has a middle name

Problem 4 Both

a. [0-9]+\$ this will match one or more digits immediately followed by a literal ‘$’

# Matches one or more digits immediately followed by a literal dollar sign.
str_extract_all("test23455$eklelkd8989k", "[0-9]+\\$")[[1]]

## [1] "23455$"

b. \b[a-z]{1,4}\b

This will extract any lower-case word of one to four letters that sits between word boundaries.

# Whole words of one to four lower-case letters; capitalised words are skipped.
str_extract_all(
  "This will extract And lower Case words with a break, less than 4 characters.",
  "\\b[a-z]{1,4}\\b"
)[[1]]

## [1] "will" "with" "a"    "less" "than"

c. .*?\.txt$ #### This will match a string that ends with a .txt extension

# The `$` anchor forces the match to run to the end of the string, so the
# earlier ".txt" occurrences do not stop it and the whole string comes back.
str_extract(
  "this sentence contains a filename called t.txt with a .txt extension filename.txt",
  ".*?\\.txt$"
)

## [1] "this sentence contains a filename called t.txt with a .txt extension filename.txt"

# Printed note explaining the result above.
"The full filename will be extracted because the regex pulls until the ending .txt"

## [1] "The full filename will be extracted because the regex pulls until the ending .txt"

# Second example: an inner ".txt" does not end the match early.
str_extract("file with a .txt in the name.txt", ".*?\\.txt$")

## [1] "file with a .txt in the name.txt"

d. \d{2}/\d{2}/\d{4}

This will extract a date pattern, such as 02/19/2018

# Sample text holding one real date, a plain number, and a date-shaped
# digit run that is not a valid date.
message <- "This contains 07/04/1776, such as 3494095, but it also contains, such as 33/35/3533444"
# Printed lead-in for the extraction shown next.
"The following text will be extracted: "

## [1] "The following text will be extracted: "

# The pattern checks only the digit layout (2/2/4), not whether the date is
# valid, so "33/35/3533" is returned as well.
str_extract_all(message, "\\d{2}/\\d{2}/\\d{4}")[[1]]

## [1] "07/04/1776" "33/35/3533"

e. <(.+?)>.+?</\1>

This captures from an opening tag, through anything in between, up to the matching closing tag. Text before or after is not captured.

# The backreference \1 makes the closing tag repeat the name captured in
# the opening tag.
str_extract_all(
  "Not captured <html><b><i>this is captured</i></html> This is not captured",
  "<(.+?)>.+?</\\1>"
)[[1]]

## [1] "<html><b><i>this is captured</i></html>"

Problem 9 David

Find the hidden message

hiddenMessage <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0Tanwo
Uwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigO
d6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5
fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"
# Working notes: the steps below show how the message was found.
# Step 1: pull out every alphabetic character (digits and punctuation dropped).
str_extract_all(hiddenMessage, "[[:alpha:]]")[[1]]

##   [1] "c" "l" "c" "o" "p" "C" "o" "w" "z" "m" "s" "t" "c" "d" "w" "n" "k"
##  [18] "i" "g" "O" "v" "d" "i" "c" "p" "N" "u" "g" "g" "v" "h" "r" "y" "n"
##  [35] "G" "j" "u" "w" "c" "z" "i" "h" "q" "r" "f" "p" "R" "x" "s" "A" "j"
##  [52] "d" "w" "p" "n" "T" "a" "n" "w" "o" "U" "w" "i" "s" "d" "i" "j" "L"
##  [69] "j" "k" "p" "f" "A" "T" "I" "d" "r" "c" "o" "c" "b" "t" "y" "c" "z"
##  [86] "j" "a" "t" "O" "a" "o" "o" "t" "j" "t" "N" "j" "n" "e" "c" "S" "f"
## [103] "e" "k" "r" "w" "Y" "w" "w" "o" "j" "i" "g" "O" "d" "v" "r" "f" "U"
## [120] "r" "b" "z" "b" "k" "A" "n" "b" "h" "z" "g" "v" "R" "i" "z" "E" "c"
## [137] "r" "o" "p" "w" "A" "g" "n" "b" "S" "q" "o" "U" "f" "P" "a" "o" "t"
## [154] "f" "b" "w" "E" "m" "k" "t" "s" "R" "z" "q" "e" "f" "y" "n" "N" "d"
## [171] "t" "k" "c" "f" "E" "g" "m" "c" "R" "g" "x" "o" "n" "h" "D" "k" "g"
## [188] "r"

# Step 2: keep only the upper-case letters.
str_extract_all(hiddenMessage, "[A-Z]")[[1]]

##  [1] "C" "O" "N" "G" "R" "A" "T" "U" "L" "A" "T" "I" "O" "N" "S" "Y" "O"
## [18] "U" "A" "R" "E" "A" "S" "U" "P" "E" "R" "N" "E" "R" "D"

# The capitals read like a sentence. Keep the periods as well, since they
# mark the word boundaries, and join all the pieces into one string.
new.hidden <- str_c(
  str_extract_all(hiddenMessage, "([A-Z\\.])")[[1]],
  collapse = ""
)
# Show the joined result; the periods still separate the words.
new.hidden

## [1] "CONGRATULATIONS.YOU.ARE.A.SUPERNERD"

# Swap every period for a space, then display the message in lower case.
final.msg <- gsub(pattern = "\\.", replacement = " ", x = new.hidden)
tolower(final.msg)

## [1] "congratulations you are a supernerd"

Problem 9 Anthony

Find the Hidden Message

# Puzzle text with the line breaks written as \n escapes.
data <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0Tanwo\nUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigO\nd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5\nfy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"

# Keep only the capital letters and glue them into one string.
data <- paste(str_extract_all(data, "[A-Z]")[[1]], collapse = "")
# Re-insert the word breaks using the known word lengths (15, 3, 3, 1, 5, 4).
gsub(
  "(.{15})(.{3})(.{3})(.{1})(.{5})(.{4})",
  "\\1 \\2 \\3 \\4 \\5 \\6",
  data
)

## [1] "CONGRATULATIONS YOU ARE A SUPER NERD"

Data607 Assignment

David Apolinar, Anthony Muñoz

2/14/2019

Problem 3 Both

3-1

Problem 3-2

Checking whether the names have a title

Problem 3-3

Check whether a character has a 2nd name

Problem 4 Both

a. [0-9]+\$ this will match one or more digits immediately followed by a literal ‘$’

b. \b[a-z]{1,4}\b

This will extract any lower-case word of one to four letters that sits between word boundaries.

c. .*?\.txt$ #### This will match a string that ends with a .txt extension

d. \d{2}/\d{2}/\d{4}

This will extract a date pattern, such as 02/19/2018

e. <(.+?)>.+?</\1>

This captures from an opening tag, through anything in between, up to the matching closing tag. Text before or after is not captured.

Problem 9 David

Find the hidden message

Problem 9 Anthony

Find the Hidden Message