Assignment-3

Start

Load stringr package and the data

library(stringr)
# Raw phone-book text: names and phone numbers run together in one string.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"

Extract the names from the raw data

# A name is any run of two or more letters, commas, periods or spaces.
# raw.data is a single string, so the first list element holds every match.
name <- str_extract_all(raw.data, pattern = "[[:alpha:],. ]{2,}")[[1]]
name

## [1] "Moe Szyslak"          "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders"         "Simpson, Homer"       "Dr. Julius Hibbert"

Problem 3.a)

Step 1. Remove any word that ends with a period. This removes all titles and second names.

# Drop every "word. " token: titles such as "Dr. " and initials such as "C. ".
name1 <- str_remove_all(name, pattern = "\\w+\\. ")
name1

## [1] "Moe Szyslak"       "Burns, Montgomery" "Timothy Lovejoy"  
## [4] "Ned Flanders"      "Simpson, Homer"    "Julius Hibbert"

Step 2. Create a function that splits a name at the comma and flips the parts so that the first name comes first.

#' Flip "Last, First" names into "First Last" order.
#'
#' Vectorised over `string`. A name that contains no ", " separator is
#' returned unchanged instead of becoming NA, so the function is safe to
#' apply to a whole vector of mixed-format names.
#'
#' @param string Character vector of names.
#' @return Character vector the same length as `string`.
correctName <- function(string)
{
  vapply(
    str_split(string, pattern = ", "),
    function(parts)
    {
      # No comma: there is nothing to flip, keep the name as it is.
      if (length(parts) < 2)
      {
        return(parts[1])
      }
      # parts[1] is the last name and parts[2] the first name.
      str_c(parts[2], parts[1], sep = " ")
    },
    character(1),
    USE.NAMES = FALSE
  )
}

Step 3. Use a for-loop to go through all the names, applying the function to each name that contains a comma.

# seq_along() yields an empty sequence for an empty vector, whereas
# 1:length(name1) would iterate over c(1, 0) and fail on name1[1].
for (i in seq_along(name1))
{
  # Only "Last, First" entries contain ", " and need to be flipped.
  if (str_detect(name1[i], pattern = ", "))
  {
    name1[i] <- correctName(name1[i])
  }
}
name1

## [1] "Moe Szyslak"      "Montgomery Burns" "Timothy Lovejoy" 
## [4] "Ned Flanders"     "Homer Simpson"    "Julius Hibbert"

Problem 3.b)

# Escape the periods so they match a literal "." rather than any character;
# unescaped, "Dr." would also match inside a name such as "Drew".
title <- str_detect(name, pattern = "Rev\\.|Dr\\.")
data.frame(name, title)

##                   name title
## 1          Moe Szyslak FALSE
## 2 Burns, C. Montgomery FALSE
## 3 Rev. Timothy Lovejoy  TRUE
## 4         Ned Flanders FALSE
## 5       Simpson, Homer FALSE
## 6   Dr. Julius Hibbert  TRUE

Alternatively, the code below will catch other titles as well, by detecting words of 2 to 3 letters that end with a period.

# \\b anchors the match at the start of a word, so only whole words of 2 to 3
# characters followed by ". " count; without it the tail of any longer word
# ending in a period (e.g. "ry. ") would also match.
str_detect(name, pattern = "\\b\\w{2,3}\\. ")

## [1] FALSE FALSE  TRUE FALSE FALSE  TRUE

Problem 3.c)

# A second name appears as a lone initial: a space, one word character,
# a period and another space (e.g. " C. ").
second_name <- str_detect(string = name, pattern = " \\w{1}\\. ")
data.frame(name, second_name)

##                   name second_name
## 1          Moe Szyslak       FALSE
## 2 Burns, C. Montgomery        TRUE
## 3 Rev. Timothy Lovejoy       FALSE
## 4         Ned Flanders       FALSE
## 5       Simpson, Homer       FALSE
## 6   Dr. Julius Hibbert       FALSE

Problem 4.a)

One or more digits (0 through 9) followed by a literal dollar sign $.

# Digits followed by an escaped (literal) dollar sign.
example4a <- "90210$"
str_detect(string = example4a, pattern = "[0-9]+\\$")

## [1] TRUE

Problem 4.b)

A word that is 1 to 4 lower case English letters in length

# Word boundaries on both sides keep the match to a whole word of 1-4 letters.
example4b <- "test"
str_detect(string = example4b, pattern = "\\b[a-z]{1,4}\\b")

## [1] TRUE

Problem 4.c)

A .txt file named with or without any characters.

# Zero or more leading characters, then a literal ".txt" at the end of the string.
example4c <- c("testing1.txt", ".txt")
str_detect(string = example4c, pattern = ".*?\\.txt$")

## [1] TRUE TRUE

Problem 4.d)

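A date formatted mm/dd/yyyy, such as a date of birth, although the pattern does not have to be used for a DOB. It only checks the digit layout, so an impossible date such as 12/34/5678 also matches.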

# Two digits, slash, two digits, slash, four digits; the values themselves
# are not validated, so the second example matches as well.
example4d <- c("07/02/2013", "12/34/5678")
str_detect(string = example4d, pattern = "\\d{2}/\\d{2}/\\d{4}")

## [1] TRUE TRUE

Problem 4.e)

This is for searching HTML code: a tag name surrounded by <> brackets, followed by one or more characters of any kind, followed by a closing tag that back-references the same tag name preceded by a slash /.

# Group 1 captures the tag name; \\1 requires the closing tag to repeat it.
example4e <- c("<bold> Any string </bold>", "<646718>3</646718>")
str_detect(string = example4e, pattern = "<(.+?)>.+?</\\1>")

## [1] TRUE TRUE

Problem 9

Here’s the string where there’s a message hidden:

# Scrambled text that hides the secret message.
# NOTE(review): this variable shares its name with base::message(); calls to
# message() still resolve to the function, but a distinct name would be clearer.
message <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"

I noticed there are four types of characters in this string:

lower case letters [a-z]
upper case letters [A-Z]
digits [0-9]
two punctuation marks [[.]!]

My first plan was to remove these types one by one from the string to see what’s the result.

# One character-class pattern per kind of character found in the message.
func <- c("[0-9]", "[a-z]", "[A-Z]", "[[.]!]")
# The single message is matched against each pattern in turn, giving one
# result per character class with only that class stripped out.
str_replace_all(string = message, pattern = func, replacement = "")

## [1] "clcopCowzmstcdwnkigOvdicpNuggvhrynGjuwczihqrfpRxsAjdwpnTanwoUwisdijLjkpfATIdrcocbtyczjatOaootjtNjnecSfek.rwYwwojigOdvrfUrbz.bkAnbhzgvRizEcrop.wAgnb.SqoUfPaotfbwEmktsRzqefynNdtkcfEgmcRgxonhDk!gr"                                                  
## [2] "C10877ON92G8R5A50TU7L803AT5I307O553N364S.11YO6U2.2A4R905E.A.SU65P17E2463R95896N594E9054R5D!"                                                                                                                                                        
## [3] "clcopow1zmstc0d87wnkig7vdicpuggvhryn92juwczi8hqrfpxs5j5dwpn0anwowisdij7j8kpf035dr3coc0bt7yczjataootj55t3j3ne6c4fek.r1w1wwojigd6vrfrbz2.2bknbhzgv49i05zcrop.wgnb.qo65fa1otfb7wm24k6t3s9zqe5fy89n6d5t9kc4f905gmc4gxo5nhk!gr"                          
## [4] "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfekr1w1YwwojigOd6vrfUrbz22bkAnbhzgv4R9i05zEcropwAgnbSqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDkgr"

Didn’t look good.

My second try was to remove combinations of two of the four types from the string, then combinations of three types, and check the result each time.

Here I use the combn function to generate the combinations, for example:

# Every way of choosing two of the four patterns, one combination per column.
combn(func, m = 2)

##      [,1]    [,2]    [,3]     [,4]    [,5]     [,6]    
## [1,] "[0-9]" "[0-9]" "[0-9]"  "[a-z]" "[a-z]"  "[A-Z]" 
## [2,] "[a-z]" "[A-Z]" "[[.]!]" "[A-Z]" "[[.]!]" "[[.]!]"

Here, it generates a matrix, where each column is a combination of the types.

I then wrote a double for-loop and used str_c to combine the types, and removed the combined types from the string message.

combo <- combn(func, 2)
combo_dim <- dim(combo)

# Each column of `combo` holds one combination of character classes.
# seq_len() keeps the loop safe if there are no columns to visit.
for (j in seq_len(combo_dim[2]))
{
  # str_c(collapse = ) joins the whole column into one alternation such as
  # "[0-9]|[a-z]" in a single call, replacing the inner loop that spliced
  # the classes together one at a time.
  new_func <- str_c(combo[, j], collapse = "|")

  # Now removing the combined funcs
  new_message <- str_replace_all(message, new_func, replacement = "")
  print(c(new_func, new_message))
}

## [1] "[0-9]|[a-z]"                         
## [2] "CONGRATULATIONS.YOU.ARE.A.SUPERNERD!"
## [1] "[0-9]|[A-Z]"                                                                                                                                                       
## [2] "clcopowzmstcdwnkigvdicpuggvhrynjuwczihqrfpxsjdwpnanwowisdijjkpfdrcocbtyczjataootjtjnecfek.rwwwojigdvrfrbz.bknbhzgvizcrop.wgnb.qofaotfbwmktszqefyndtkcfgmcgxonhk!gr"
## [1] "[0-9]|[[.]!]"                                                                                                                                                                                
## [2] "clcopCowzmstcdwnkigOvdicpNuggvhrynGjuwczihqrfpRxsAjdwpnTanwoUwisdijLjkpfATIdrcocbtyczjatOaootjtNjnecSfekrwYwwojigOdvrfUrbzbkAnbhzgvRizEcropwAgnbSqoUfPaotfbwEmktsRzqefynNdtkcfEgmcRgxonhDkgr"
## [1] "[a-z]|[A-Z]"                                                 
## [2] "1087792855078035307553364.1162.24905..651724639589659490545!"
## [1] "[a-z]|[[.]!]"                                                                          
## [2] "C10877ON92G8R5A50TU7L803AT5I307O553N364S11YO6U22A4R905EASU65P17E2463R95896N594E9054R5D"
## [1] "[A-Z]|[[.]!]"                                                                                                                                                                                                        
## [2] "clcopow1zmstc0d87wnkig7vdicpuggvhryn92juwczi8hqrfpxs5j5dwpn0anwowisdij7j8kpf035dr3coc0bt7yczjataootj55t3j3ne6c4fekr1w1wwojigd6vrfrbz22bknbhzgv49i05zcropwgnbqo65fa1otfb7wm24k6t3s9zqe5fy89n6d5t9kc4f905gmc4gxo5nhkgr"

I found the message! After removing the two types “[0-9]|[a-z]”, the message appears to be “Congratulations you are a supernerd”.

I continued on to remove combinations of three types, using the same method, just to see what happens.

combo <- combn(func, 3)
combo_dim <- dim(combo)

# Each column of `combo` holds one combination of three character classes.
# seq_len() keeps the loop safe if there are no columns to visit.
for (j in seq_len(combo_dim[2]))
{
  # str_c(collapse = ) joins the whole column into one alternation such as
  # "[0-9]|[a-z]|[A-Z]" in a single call, replacing the inner loop that
  # spliced the classes together one at a time.
  new_func <- str_c(combo[, j], collapse = "|")

  # Now removing the combined funcs
  new_message <- str_replace_all(message, new_func, replacement = "")
  print(c(new_func, new_message))
}

## [1] "[0-9]|[a-z]|[A-Z]" "....!"            
## [1] "[0-9]|[a-z]|[[.]!]"              "CONGRATULATIONSYOUAREASUPERNERD"
## [1] "[0-9]|[A-Z]|[[.]!]"                                                                                                                                           
## [2] "clcopowzmstcdwnkigvdicpuggvhrynjuwczihqrfpxsjdwpnanwowisdijjkpfdrcocbtyczjataootjtjnecfekrwwwojigdvrfrbzbknbhzgvizcropwgnbqofaotfbwmktszqefyndtkcfgmcgxonhkgr"
## [1] "[a-z]|[A-Z]|[[.]!]"                                     
## [2] "1087792855078035307553364116224905651724639589659490545"

As you can see, after removing “[0-9]|[a-z]|[[.]!]”, the message can still be read. This is because the message is hidden in the upper-case letters of the string; the periods and exclamation mark only separate the words.