Title DATA 607 week 3 assignment
Author Rose Koh
Date 2018/02/16
Descriptions Regular expressions
Rpub Rpub Link
Github Github Link

Assignments

  1. Automated Data Collection in R (Chapter 8) solution to problems 3,4
  2. Problem 9 is extra credit

Requirements

  1. R markdown code
  2. Share on Github, Rpub.
library(stringr)

3. Copy the introductory example. The vector name stores the extracted names.

# Raw phone-book text: names and phone numbers run together without delimiters.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"

# A name is a run of two or more letters, periods, commas or spaces.
# raw.data is a single string, so the first (only) list element holds every match.
name <- str_extract_all(raw.data, "[[:alpha:]., ]{2,}")[[1]]
# A phone number is an optional (possibly parenthesised) three-digit area code,
# then three digits and four digits, each optionally preceded by a dash or space.
phone <- str_extract_all(raw.data, "\\(?(\\d{3})?\\)?(-| )?\\d{3}(-| )?\\d{4}")[[1]]
data.frame(name = name, phone = phone)
##                   name          phone
## 1          Moe Szyslak       555-1239
## 2 Burns, C. Montgomery (636) 555-0113
## 3 Rev. Timothy Lovejoy       555-6542
## 4         Ned Flanders       555 8904
## 5       Simpson, Homer   636-555-3226
## 6   Dr. Julius Hibbert        5553642
# Rewrite "Last, First" entries as "First Last"; names without a comma are
# left untouched. The work is vectorised, so an empty `name` vector is fine
# (the previous 1:length(name) loop would index out of bounds in that case).
has_comma <- str_detect(name, ",")
# Split at the first comma only: column 1 is the surname, column 2 the rest.
name_parts <- str_split_fixed(name[has_comma], ",", 2)
# Trim both pieces so the space that followed the comma does not end up as a
# leading space in the reordered name.
name[has_comma] <- str_c(str_trim(name_parts[, 2]), " ", str_trim(name_parts[, 1]))

data.frame(name, phone)
##                   name          phone
## 1          Moe Szyslak       555-1239
## 2  C. Montgomery Burns (636) 555-0113
## 3 Rev. Timothy Lovejoy       555-6542
## 4         Ned Flanders       555 8904
## 5        Homer Simpson   636-555-3226
## 6   Dr. Julius Hibbert        5553642
# Rev., Dr.
# A title is a whole word of two or three letters followed by a literal period.
# The pattern is applied to every name directly; matching the names against a
# vector of extracted titles would depend on the two vectors lining up by
# position when recycled.
title <- str_detect(name, "\\b[[:alpha:]]{2,3}\\.")
title
## [1] FALSE FALSE  TRUE FALSE FALSE  TRUE
# or use grepl
# The period is escaped so that only the literal abbreviations "Rev." and "Dr."
# match (an unescaped "." would also accept e.g. "Drew").
title <- grepl("\\b(Rev|Dr)\\.", name)
check.title <- data.frame(name, title)
check.title
##                   name title
## 1          Moe Szyslak FALSE
## 2  C. Montgomery Burns FALSE
## 3 Rev. Timothy Lovejoy  TRUE
## 4         Ned Flanders FALSE
## 5        Homer Simpson FALSE
## 6   Dr. Julius Hibbert  TRUE
# Strip the literal titles "Rev. " and "Dr. " first so that they are not
# mistaken for an abbreviated name.
second_name <- gsub("Dr. ", "", gsub("Rev. ", "", name, fixed = TRUE), fixed = TRUE)
# Any remaining word that ends in a period is taken as an abbreviated name;
# str_extract() yields NA where there is none.
second_name <- str_extract(string = second_name, pattern = "[[:alpha:]]+\\.")
data.frame(name, !is.na(second_name))
##                   name X.is.na.second_name.
## 1          Moe Szyslak                FALSE
## 2  C. Montgomery Burns                 TRUE
## 3 Rev. Timothy Lovejoy                FALSE
## 4         Ned Flanders                FALSE
## 5        Homer Simpson                FALSE
## 6   Dr. Julius Hibbert                FALSE

4. Describe the types of strings that conform to the following regular expressions and construct an example that is matched by the regular expression.

a. “[0-9]+\$”
# One or more digits immediately followed by a literal dollar sign.
# The "$" is escaped, so it is a character to match, not an end-of-string anchor.
a <- c(
  "10$",
  "20eur",
  "30krw",
  "40rmb",
  "50gbp"
)
a.check <- grepl(pattern = "[0-9]+\\$", x = a)
data.frame(a = a, a.check = a.check)
##       a a.check
## 1   10$    TRUE
## 2 20eur   FALSE
## 3 30krw   FALSE
## 4 40rmb   FALSE
## 5 50gbp   FALSE
b. “\b[a-z]{1,4}\b”
# A stand-alone word made of one to four lower-case letters: the \b anchors
# require a word boundary on both sides, so longer words and words with a
# trailing digit (a digit is a word character) do not match.
b <- c(
  "a", "ab", "abc", "abcd", "abcde",
  "a1", "ab2", "abc3", "abcd4"
)
b.check <- grepl(pattern = "\\b[a-z]{1,4}\\b", x = b)
data.frame(b = b, b.check = b.check)
##       b b.check
## 1     a    TRUE
## 2    ab    TRUE
## 3   abc    TRUE
## 4  abcd    TRUE
## 5 abcde   FALSE
## 6    a1   FALSE
## 7   ab2   FALSE
## 8  abc3   FALSE
## 9 abcd4   FALSE
c. “.?\.txt$”
# Any string ending in the literal extension ".txt": an optional single
# character, then an escaped period and "txt", anchored to the end by "$".
c <- c(
  "abc.pdf", "123.png", "a1b2c3.jpeg",
  "abc.txt", "a1b2c3.txt", "1_2_3.txt"
)
c.check <- grepl(pattern = ".?\\.txt$", x = c)
data.frame(c = c, c.check = c.check)
##             c c.check
## 1     abc.pdf   FALSE
## 2     123.png   FALSE
## 3 a1b2c3.jpeg   FALSE
## 4     abc.txt    TRUE
## 5  a1b2c3.txt    TRUE
## 6   1_2_3.txt    TRUE
d. “\d{2}/\d{2}/\d{4}”
# A date-like pattern: two digits, a slash, two digits, a slash, four digits.
d <- c(
  "12341234", "12/12/1234", "ab/c2/de23",
  "00000000", "00/00/0000", "00-00-0000"
)
d.check <- grepl(pattern = "\\d{2}/\\d{2}/\\d{4}", x = d)
data.frame(d = d, d.check = d.check)
##            d d.check
## 1   12341234   FALSE
## 2 12/12/1234    TRUE
## 3 ab/c2/de23   FALSE
## 4   00000000   FALSE
## 5 00/00/0000    TRUE
## 6 00-00-0000   FALSE
e. “<(.+?)>.+?</\1>”
# An HTML-like element: "<", one or more characters captured as group 1, ">",
# then at least one character of content, then a closing tag "</...>" whose
# name is the back-reference \1, i.e. exactly the text captured in the opening
# tag. Mismatched tag names and empty content therefore do not match.
e <- c(
  "<head>hello</toe>",
  "<body>world</body>",
  "<h1>heading false </h2>",
  "<p></p>",
  "<br> example first </br> second</br>"
)
e.check <- grepl(pattern = "<(.+?)>.+?</\\1>", x = e)
data.frame(e = e, e.check = e.check)
##                                      e e.check
## 1                    <head>hello</toe>   FALSE
## 2                   <body>world</body>    TRUE
## 3              <h1>heading false </h2>   FALSE
## 4                              <p></p>   FALSE
## 5 <br> example first </br> second</br>    TRUE

9. The following code hides a secret message. Crack it with R and regular expressions. Some of the characters are more revealing than others.

p9 <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"

# Exploratory steps:
# str_extract_all(p9, "[[:upper:]]")
# str_extract_all(p9, "[[:upper:].!]")

# The message is spelled by the upper-case letters. The periods mark the word
# breaks and the closing "!" is part of the message, so both are kept.
p9 <- unlist(str_extract_all(p9, "[[:upper:].!]"))
p9 <- paste0(p9, collapse = "")
# Each period stands in for the space between two words.
p9 <- gsub(".", " ", p9, fixed = TRUE)
p9
## [1] "CONGRATULATIONS YOU ARE A SUPERNERD!"