DATA607- Assignment 3

Loading packages

I am loading stringr package.

library(stringr)

Problem 3

Copy the introductory example.

Use the tools of this chapter to rearrange the vector so that all elements conform to the standard first_name last_name.

Load data

# Copy the introductory example.
# The string is assembled from pieces with paste0() so that no line break ends
# up inside the data: a literal newline inside the quotes would split the
# phone number "555-6542" in two.
raw.data <- paste0(
  "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery",
  "555-6542Rev. Timothy Lovejoy555 8904Ned Flanders",
  "636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"
)

We are calling the str_extract_all function from the stringr package. It is defined as str_extract_all(string, pattern): we first pass the string to be operated upon and then the expression we are looking for. str_extract_all extracts every match.

#Extract the names and store them in a vector called "name"
name <- unlist(str_extract_all(raw.data, "[[:alpha:]., ]{2,}"))
name

## [1] "Moe Szyslak"          "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders"         "Simpson, Homer"       "Dr. Julius Hibbert"

### Extract the last name from each element of a character vector of names.
### `list` is the character vector of names; the result is a character vector
### of the same length (NA where nothing matches).
get_last <- function(list) {
    # A last name is either the run of letters directly before a comma
    # ("Burns, ...") or the first run of two or more letters that follows
    # a word character and a single space ("Moe Szyslak").
    surname_pattern <- '[[:alpha:]]{1,}\\,|\\b [[:alpha:]]{2,}'
    raw_match <- str_extract(list, surname_pattern)
    # Keep only the letters, dropping the comma or leading space that was
    # captured together with the name.
    str_extract(raw_match, "[[:alpha:]]{1,}")
}

### Extract the first name from each element of a character vector of names.
### `list` is the character vector of names; the result is a character vector
### of the same length (NA where nothing matches).
get_first <- function(list) {
    # The leftmost of these three shapes is taken as the first name:
    #   - a run of letters followed by a space        ("Moe Szyslak")
    #   - ". " followed by a run of letters           ("Rev. Timothy ...")
    #   - ", " followed by two or more letters        ("Simpson, Homer")
    given_pattern <- '[[:alpha:]]{1,} |\\. [[:alpha:]]{1,}|\\, [[:alpha:]]{2,}'
    raw_match <- str_extract(list, given_pattern)
    # Strip the surrounding punctuation/space, leaving only the letters.
    str_extract(raw_match, "[[:alpha:]]{1,}")
}

# run functions to create a dataframe called "namedf"
namedf <- data.frame(first = get_first(name), 
                      last = get_last(name))
namedf

# get full name in "first_name last_name" form
# paste() already separates its arguments with a single space; passing an
# extra '' argument inserted a second space between first and last name.
fullname <- paste(namedf$first, namedf$last)
fullname

## [1] "Moe Szyslak"      "Montgomery Burns" "Timothy Lovejoy" 
## [4] "Ned Flanders"     "Homer Simpson"    "Julius Hibbert"

Construct a logical vector indicating whether a character has a title (i.e., Rev. and Dr.).

# create a logical vector describing whether each name carries a title
# The dots are escaped so that they match a literal "." after the title;
# an unescaped "." matches any character (e.g. "Drew" would count as "Dr.").
nametitle <- str_detect(name, "Dr\\.|Rev\\.")
nametitle

## [1] FALSE FALSE  TRUE FALSE FALSE  TRUE

Construct a logical vector indicating whether a character has a second name.

# create a logical vector describing whether each name has a second name
# A second name is recognised as a single letter followed by a period, with a
# space on either side (e.g. " C. "). The character class is written as
# [[:alpha:]], matching the other patterns in this document; the bare
# [:alpha:] form is only understood as a class by some regex engines.
secondname <- str_detect(name, " [[:alpha:]]\\. ")
secondname

## [1] FALSE  TRUE FALSE FALSE FALSE FALSE

Test the code

# test
raw.datatest <- "333-6589Awsaf Akbar(502) 888-0253Md. Forhad Akbar15165-54654-32546Shamzida Sharmin525"

#Extract the names and store them in a vector called "nametest"
nametest <- unlist(str_extract_all(raw.datatest, "[[:alpha:]., ]{2,}"))
nametest

## [1] "Awsaf Akbar"      "Md. Forhad Akbar" "Shamzida Sharmin"

# run functions to create namedftest
namedftest <- data.frame(first = get_first(nametest), 
                      last = get_last(nametest))
namedftest

# get full name in "first_name last_name" form
# paste() already separates its arguments with a single space; passing an
# extra '' argument inserted a second space between first and last name.
fullnametest <- paste(namedftest$first, namedftest$last)
fullnametest

## [1] "Awsaf Akbar"      "Forhad Akbar"     "Shamzida Sharmin"

# create a logical vector describing whether each test name carries a title
# The dot is escaped so that it matches a literal "." after the title rather
# than any character.
nametitletest <- str_detect(nametest, "Md\\.")
nametitletest

## [1] FALSE  TRUE FALSE

# create a logical vector describing whether each test name has a second name
# A second name is recognised as a single letter followed by a period, with a
# space on either side. The character class is written as [[:alpha:]],
# matching the other patterns in this document.
secondnametest <- str_detect(nametest, " [[:alpha:]]\\. ")
secondnametest

## [1] FALSE FALSE FALSE

Problem 4

Describe the types of strings that conform to the following regular expressions and construct an example that is matched by the regular expression.

I will try to explain each regular expression in details and come up with at least two different examples.

[0-9]+\$ The [0-9]+ part matches a run of one or more digits (0 through 9). The backslash (written as two backslashes inside an R string) tells us to treat the $ as a literal character to be matched, not as a metacharacter. Hence, any string of digits followed by a dollar sign is matched by this regular expression.

Store the pattern in a variable. Then Create two different example strings and test them

# Store the pattern in a variable
pattern_a<- "[0-9]+\\$"
# Create the first example string
a1 <- "This is a example string: 120$."
# Extract the matches from the example to see if the explanation provided is correct
example1<- unlist(str_extract_all(a1, pattern_a))
example1

## [1] "120$"

# Test regular expression
test_a1<-str_detect(a1,pattern_a)
test_a1

## [1] TRUE

# Create example strings
a2<- "240$a12$.5.7$a$"
# Test regular expression to see if explanation provided is correct
example2<- unlist(str_extract_all(a2,pattern_a))
example2

## [1] "240$" "12$"  "7$"

# Test regular expression
test_a2<-str_detect(a2,pattern_a)
test_a2

## [1] TRUE

\b[a-z]{1,4}\b This regular expression starts at a word edge, matches between one and four lower case letters, and then requires another word edge immediately after them (the end of the word, not necessarily the end of the string). Therefore, it only matches whole lower case words that are four characters or fewer in length.

Create two different example strings and test them

#Store the pattern in a variable
pattern_b<- "\\b[a-z]{1,4}\\b"
# Create example string
b1 <- "I am doing Data 607 assignmnment 3."
# Test regular expression to see if explanation provided is correct
example3<-unlist(str_extract_all(b1, pattern_b))
example3

## [1] "am"

# Test regular expression
test_b1<-str_detect(b1,pattern_b)
test_b1

## [1] TRUE

# Create example string
b2 <- "607-Crown-jewel-data-expression-six-zero-seven-etc$"
# Test regular expression to see if explanation provided is correct
example4<-unlist(str_extract_all(b2, pattern_b))
example4

## [1] "data" "six"  "zero" "etc"

# Test regular expression
test_b2<-str_detect(b2,pattern_b)
test_b2

## [1] TRUE

.*?\.txt$ The dot represents any character. It is followed by an asterisk, which means the character can be matched zero or more times. The question mark after the asterisk makes the quantifier lazy: it matches as few characters as possible while still allowing the rest of the pattern to match. The backslash (written as two backslashes inside an R string) tells us to treat the second dot literally (as a character instead of a metacharacter), which means we’re trying to match “.txt” within the string. The dollar sign tells us the “.txt” must be at the end of the string. This regular expression therefore matches “.txt” or any string of characters that ends in “.txt”.

Create two different example strings and test them

#Store the pattern in a variable
pattern_c<- ".*?\\.txt$"
# Create example string
c1 <- "5454#34_2.txt option.png.image dark.txt"
# Test regular expression to see if explanation provided is correct
example5<-unlist(str_extract_all(c1, pattern_c))
example5

## [1] "5454#34_2.txt option.png.image dark.txt"

# Test regular expression
test_c1<-str_detect(c1,pattern_c)
test_c1

## [1] TRUE

# Create example string
c2 <- c(".txt","move.text","data.txt","1$g!1.txt")
# Test regular expression to see if explanation provided is correct
example6<-unlist(str_extract_all(c2, pattern_c))
example6

## [1] ".txt"      "data.txt"  "1$g!1.txt"

# Test regular expression on the second example (c2).
# The earlier version passed c1 here, so c2 was never actually tested.
test_c2 <- str_detect(c2, pattern_c)
test_c2

## [1]  TRUE FALSE  TRUE  TRUE

\d{2}/\d{2}/\d{4} The backslash followed by ‘d’ (written with two backslashes inside an R string) matches a numerical digit, and the {x} tells us how many digits to match. Between the three sets of digits the expression looks for a forward slash character. Thus, this regular expression would match any date in a mm/dd/yyyy or dd/mm/yyyy format, or any string in that format even if it is not a valid date (e.g., “34/99/0002”). It would not match dates written with a one-digit day or month, or with a year of fewer than four digits.

Create two different example strings and test them

#Store the pattern in a variable
pattern_d<- "\\d{2}/\\d{2}/\\d{4}"
# Create example string
d1<-c("2/15/2019","04/12/2019","26/03/1985","1/1/1986","34/99/0005","2/3/358")
# Test regular expression to see if explanation provided is correct
example7<-unlist(str_extract_all(d1, pattern_d))
example7

## [1] "04/12/2019" "26/03/1985" "34/99/0005"

# Test regular expression
test_d1<-str_detect(d1,pattern_d)
test_d1

## [1] FALSE  TRUE  TRUE FALSE  TRUE FALSE

# Create example string
d2<-"09/12/2019 09/12/2016 2014/01/04"
# Test regular expression to see if explanation provided is correct
example8<-unlist(str_extract_all(d2, pattern_d))
example8

## [1] "09/12/2019" "09/12/2016"

# Test regular expression
test_d2<-str_detect(d2,pattern_d)
test_d2

## [1] TRUE

<(.+?)>.+?</\1> This regular expression matches any string that starts with ‘<’, followed by one or more characters. Note that the one or more characters part (dot - plus - question mark) is in parentheses. After this, the ‘>’ character is matched, then one or more characters again, and then ‘</’. After this, it matches the same string which was matched earlier using the code inside the aforementioned parentheses (this is what the \1 does). Then, it looks for ‘>’. This regex uses backreferencing to return any string that starts with a <text> and ends with </text>. This would be a good way to search through html or xml.

Create two different example strings and test them

#Store the pattern in a variable
pattern_e<- "<(.+?)>.+?</\\1>"
# Create example string
e1<-c("<tag>Text</tag>","<Font size=4,color=black>Black Text</Font size=4,color=blue>")
# Test regular expression to see if explanation provided is correct
example9<-unlist(str_extract_all(e1, pattern_e))
example9

## [1] "<tag>Text</tag>"

# Test regular expression
test_e1<-str_detect(e1,pattern_e)
test_e1

## [1]  TRUE FALSE

# Create example string
e2<-"<div>hello world</div> <ol><li>one</li><li>two</li></ol>"
# Test regular expression to see if explanation provided is correct
example10<-unlist(str_extract_all(e2, pattern_e))
example10

## [1] "<div>hello world</div>"            "<ol><li>one</li><li>two</li></ol>"

# Test regular expression
test_e2<-str_detect(e2,pattern_e)
test_e2

## [1] TRUE

Problem 9

The following code hides a secret message. Crack it with R and regular expressions. Hint: Some of the characters are more revealing than others! The code snippet is also available in the materials at www.r-datacollection.com.

clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0Tanwo Uwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigO d6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5 fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr

Step 1

# store the code into a variable called "Secret"
secret <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"

Step 2

# keep only the uppercase letters and punctuation marks, which drops all
# digits and lowercase letters from the coded text
pattern<- "[[:upper:]]|[[:punct:]]"
cracked <- unlist(str_extract_all(secret, pattern))
cracked

##  [1] "C" "O" "N" "G" "R" "A" "T" "U" "L" "A" "T" "I" "O" "N" "S" "." "Y"
## [18] "O" "U" "." "A" "R" "E" "." "A" "." "S" "U" "P" "E" "R" "N" "E" "R"
## [35] "D" "!"

Step 3

# Replace the periods with a space
secret <- str_replace_all(cracked,"\\.", " ")
secret

##  [1] "C" "O" "N" "G" "R" "A" "T" "U" "L" "A" "T" "I" "O" "N" "S" " " "Y"
## [18] "O" "U" " " "A" "R" "E" " " "A" " " "S" "U" "P" "E" "R" "N" "E" "R"
## [35] "D" "!"

Step 4

# Print the secret message
secret_message<- paste(secret, collapse="")
print(secret_message)

## [1] "CONGRATULATIONS YOU ARE A SUPERNERD!"

DATA607- Assignment 3

Assignment 03 by Md Forhad Akbar

2019-09-14

Problem 3

Test the code

Problem 4

Problem 9

Step 1

Step 2

Step 3

Step 4