Assignment 3

Question 3

Copy the introductory example. The vector name stores the extracted names.

R> name

[1] “Moe Szyslak” “Burns, C. Montgomery” “Rev. Timothy Lovejoy”

[4] “Ned Flanders” “Simpson, Homer” “Dr. Julius Hibbert”

(a) Use the tools of this chapter to rearrange the vector so that all elements conform to the standard first_name last_name.

library(stringr)
# Raw character names exactly as extracted in the introductory example;
# formats are mixed ("First Last", "Last, First", and titled names).
names <- c(
  "Moe Szyslak",
  "Burns, C. Montgomery",
  "Rev. Timothy Lovejoy",
  "Ned Flanders",
  "Simpson, Homer",
  "Dr. Julius Hibbert"
)

names

## [1] "Moe Szyslak"          "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders"         "Simpson, Homer"       "Dr. Julius Hibbert"

# Extract one first name per element of `names`, keeping the result aligned
# with the input (NA when nothing matches) so it can be pasted to `last_name`.
#   * "[[:alpha:]]+(?=\\s)"        -> a word directly followed by whitespace
#                                     ("Moe", "Timothy", "Ned", "Julius")
#   * "(?<=[.,] )[[:alpha:]]+$"    -> the final word when it follows ". " or ", "
#                                     ("Montgomery", "Homer")
# str_extract() returns exactly one value per input string, unlike
# unlist(str_extract_all()), which silently drops or duplicates elements when
# a name has zero or several matches.
first_name <- str_extract(names, "[[:alpha:]]+(?=\\s)|(?<=[.,] )[[:alpha:]]+$")
first_name

## [1] "Moe"        "Montgomery" "Timothy"    "Ned"        "Homer"     
## [6] "Julius"

# Last name is either the word in front of a comma ("Burns,") or the final
# word of the string; the outer extraction then drops the trailing comma.
last_name <- str_extract(
  str_extract(names, "([[:alpha:]]+?[,]|[[:alpha:]]+$)"),
  "[^,]+"
)
last_name

## [1] "Szyslak"  "Burns"    "Lovejoy"  "Flanders" "Simpson"  "Hibbert"

# Join the two aligned vectors element-wise as "first_name last_name".
firstlast <- str_c(first_name, last_name, sep = " ")
firstlast

## [1] "Moe Szyslak"      "Montgomery Burns" "Timothy Lovejoy" 
## [4] "Ned Flanders"     "Homer Simpson"    "Julius Hibbert"

(b) Construct a logical vector indicating whether a character has a title (i.e., Rev. and Dr.).

# A title is a leading abbreviation of at least two letters that ends in a
# period ("Rev.", "Dr.").
# The period is escaped instead of written as ['.'], because inside a
# character class the quotes are literal and would also match an apostrophe.
# Requiring two or more letters keeps a leading initial such as "C." from
# being counted as a title.
title <- str_detect(names, "^[[:alpha:]]{2,}\\.")
title

## [1] FALSE FALSE  TRUE FALSE FALSE  TRUE

(c) Construct a logical vector indicating whether a character has a second name.

# A second name shows up as a standalone initial: one uppercase letter at a
# word boundary followed by a period ("C.").
# The period is escaped instead of written as ['.'], which would also match an
# apostrophe (e.g. the "O'" in "O'Brien"); \\b keeps the last letter of an
# all-caps abbreviation from being mistaken for an initial.
second_name <- str_detect(names, "\\b[[:upper:]]\\.")
second_name

## [1] FALSE  TRUE FALSE FALSE FALSE FALSE

Question 4

Describe the types of strings that conform to the following regular expressions and construct an example that is matched by the regular expression.

(a) [0-9]+\$

# Matches one or more digits immediately followed by a literal $ character

# Sample strings: some contain a digit run followed by "$", some do not.
s4a <- c(
  "56424895314$",
  "534565645656",
  "dsdf542$65201",
  "a442367",
  "b542346y",
  "abc123$0$ qw567$$"
)
unlist(str_extract_all(string = s4a, pattern = "[0-9]+\\$"))

## [1] "56424895314$" "542$"         "123$"         "0$"          
## [5] "567$"

(b) \b[a-z]{1,4}\b

# Matches a standalone word of 1 to 4 lowercase letters, i.e. with a word boundary on both sides

# Sample strings: only the standalone lowercase words of length 1-4 match.
s4b <- c(
  "ahdb32ja",
  "a",
  "ab",
  "abc",
  "aBc",
  "abcd",
  "123abcd",
  "ab89@0",
  "371$12"
)
unlist(str_extract_all(string = s4b, pattern = "\\b[a-z]{1,4}\\b"))

## [1] "a"    "ab"   "abc"  "abcd"

(c) .*?\.txt$

# Matches any string that ends in the literal text '.txt' (including the bare string '.txt'); the part before it may contain any characters except a newline

# Sample strings: file-name-like values with and without a ".txt" ending.
s4c <- c(
  "filename.txt",
  "file name.txt",
  "284@$.!.txt",
  "284@$.!.",
  "2file name.txt",
  ".txt",
  "abc",
  "2.txt"
)
unlist(str_extract_all(string = s4c, pattern = ".*?\\.txt$"))

## [1] "filename.txt"   "file name.txt"  "284@$.!.txt"    "2file name.txt"
## [5] ".txt"           "2.txt"

(d) \d{2}/\d{2}/\d{4}

# Matches a date-like string of the form xx/xx/xxxx, where each x is a digit
# The backslashes in the last example are doubled: in an R string literal a
# bare "\22" or "\111" is an octal escape (a control character / the letter
# "I"), not a backslash followed by digits.
s4d <- c("42", "33/35/99", "01/18/2199", "33\\22\\1111")

unlist(str_extract_all(s4d, "\\d{2}/\\d{2}/\\d{4}"))

## [1] "01/18/2199"

(e) <(.+?)>.+?</\1>

# Matches a string of the form <s1>s2</s1>, where s1 and s2 are each one or more arbitrary characters and the closing tag must repeat s1 exactly (the \1 backreference). Such strings typically represent HTML/XML elements

# Sample strings: plain text, mismatched tags, and properly paired tags.
s5e <- c(
  "abcd",
  "4sdf asdf",
  "2",
  "  ",
  "abc xyz /abc",
  "<html> Hello World 1 </html>",
  "<a> b </c>",
  "<a> b </a>",
  "<b>  </b>",
  "<abc 123> b@2 </abc 123>"
)

# Kept as a list (no unlist) so the result shows which inputs matched.
str_extract_all(string = s5e, pattern = "<(.+?)>.+?</\\1>")

## [[1]]
## character(0)
## 
## [[2]]
## character(0)
## 
## [[3]]
## character(0)
## 
## [[4]]
## character(0)
## 
## [[5]]
## character(0)
## 
## [[6]]
## [1] "<html> Hello World 1 </html>"
## 
## [[7]]
## character(0)
## 
## [[8]]
## [1] "<a> b </a>"
## 
## [[9]]
## [1] "<b>  </b>"
## 
## [[10]]
## [1] "<abc 123> b@2 </abc 123>"

Question 9

The following code hides a secret message. Crack it with R and regular expressions. Hint: Some of the characters are more revealing than others! The code snippet is also available in the materials at www.r-datacollection.com.

clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0Tanwo Uwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigO d6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5 fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr

# Encoded message from the exercise.
unknowntext <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaotj55t3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"

# Keep only the uppercase letters and punctuation marks; lowercase letters
# and digits are the noise hiding the message.
knowntext <- unlist(
  str_extract_all(string = unknowntext, pattern = "[[:upper:]]|[[:punct:]]")
)
knowntext

##  [1] "C" "O" "N" "G" "R" "A" "T" "U" "L" "A" "T" "I" "O" "N" "S" "." "Y"
## [18] "O" "U" "." "A" "R" "E" "." "A" "." "S" "U" "P" "E" "R" "N" "E" "R"
## [35] "D" "!"

# Glue the individual characters back together into a single string.
knowntext <- str_c(knowntext, sep = "", collapse = "")
knowntext

## [1] "CONGRATULATIONS.YOU.ARE.A.SUPERNERD!"

# The periods act as word separators: split on them so that each word of the
# message becomes its own vector element.
knowntext <- unlist(str_split(string = knowntext, pattern = "\\."))
knowntext

## [1] "CONGRATULATIONS" "YOU"             "ARE"             "A"              
## [5] "SUPERNERD!"

Assignment 3 - Regex

Dhairav Chhatbar

9/13/2019

Question 3

Question 4

Question 9