# Title: DATA 607 week 3 assignment
# Author: Rose Koh
# Date: 2018/02/16
# Description: Regular expressions
# Rpub: Rpub Link
# Github: Github Link
library(stringr)
# Source text: names and phone numbers run together with no delimiter.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"

# A name is any run of at least two letters, periods, commas or spaces.
# raw.data is a single string, so the one list element holds every match.
name <- str_extract_all(raw.data, "[[:alpha:]., ]{2,}")[[1]]

# A phone number is an optional three-digit area code (optionally wrapped in
# parentheses), then three digits and four digits, with an optional "-" or " "
# before each of the last two groups.
phone <- str_extract_all(
  raw.data,
  "\\(?(\\d{3})?\\)?(-| )?\\d{3}(-| )?\\d{4}"
)[[1]]

data.frame(name = name, phone = phone)
## name phone
## 1 Moe Szyslak 555-1239
## 2 Burns, C. Montgomery (636) 555-0113
## 3 Rev. Timothy Lovejoy 555-6542
## 4 Ned Flanders 555 8904
## 5 Simpson, Homer 636-555-3226
## 6 Dr. Julius Hibbert 5553642
# Rewrite "Last, First" names as "First Last".
# str_replace() is vectorised over `name`, so no index loop is needed: names
# without a comma do not match the pattern and come back unchanged.
# The "\\s*" after the comma consumes the separator's whitespace; splitting on
# "," alone leaves that space attached to the first name and the reordered
# value then starts with a stray blank (" C. Montgomery Burns").
name <- str_replace(name, "^([^,]+),\\s*(.+)$", "\\2 \\1")
data.frame(name, phone)
## name phone
## 1 Moe Szyslak 555-1239
## 2 C. Montgomery Burns (636) 555-0113
## 3 Rev. Timothy Lovejoy 555-6542
## 4 Ned Flanders 555 8904
## 5 Homer Simpson 636-555-3226
## 6 Dr. Julius Hibbert 5553642
# Rev., Dr.
# A title is taken to be a two- or three-letter abbreviation followed by a
# literal period. Extract the titles that occur, then flag each name by testing
# it against the pattern itself. Passing the extracted titles (length 2) as the
# pattern for six names relies on vector recycling lining up by position, and
# recent stringr releases reject that recycling outright.
title_pattern <- "[[:alpha:]]{2,3}\\."
title <- unlist(str_extract_all(name, title_pattern))
str_detect(name, title_pattern)
## [1] FALSE FALSE TRUE FALSE FALSE TRUE
# or use grepl
# The periods are escaped so they match a literal "." rather than any
# character (an unescaped "Dr." would also match e.g. "Drew").
title <- grepl("Rev\\.|Dr\\.", name)
check.title <- data.frame(name, title)
check.title
## name title
## 1 Moe Szyslak FALSE
## 2 C. Montgomery Burns FALSE
## 3 Rev. Timothy Lovejoy TRUE
## 4 Ned Flanders FALSE
## 5 Homer Simpson FALSE
## 6 Dr. Julius Hibbert TRUE
# Does a name contain an abbreviated second name (letters followed by ".")?
# The honorifics are stripped first as literal strings ("Rev. " and then
# "Dr. "), so that their own trailing period is not picked up by the search.
second_name <- gsub(
  "Dr. ", "",
  gsub("Rev. ", "", name, fixed = TRUE),
  fixed = TRUE
)
# str_extract() yields NA where no "letters + period" token remains.
second_name <- str_extract(second_name, "[[:alpha:]]+\\.")
data.frame(name, !is.na(second_name))
## name X.is.na.second_name.
## 1 Moe Szyslak FALSE
## 2 C. Montgomery Burns TRUE
## 3 Rev. Timothy Lovejoy FALSE
## 4 Ned Flanders FALSE
## 5 Homer Simpson FALSE
## 6 Dr. Julius Hibbert FALSE
# Pattern "[0-9]+\\$": one or more digits immediately followed by a literal
# dollar sign. The pattern is not anchored, so the match may occur anywhere
# in the string.
a <- c(
  "10$", "20eur", "30krw", "40rmb", "50gbp"
)
a.check <- grepl(pattern = "[0-9]+\\$", x = a)
data.frame(a = a, a.check = a.check)
## a a.check
## 1 10$ TRUE
## 2 20eur FALSE
## 3 30krw FALSE
## 4 40rmb FALSE
## 5 50gbp FALSE
# Pattern "\\b[a-z]{1,4}\\b": a stand-alone word made of one to four lowercase
# letters. The word boundaries on both sides reject longer words and words
# that continue with a digit (e.g. "abcde", "a1").
b <- c(
  "a", "ab", "abc", "abcd", "abcde",
  "a1", "ab2", "abc3", "abcd4"
)
b.check <- grepl(pattern = "\\b[a-z]{1,4}\\b", x = b)
data.frame(b = b, b.check = b.check)
## b b.check
## 1 a TRUE
## 2 ab TRUE
## 3 abc TRUE
## 4 abcd TRUE
## 5 abcde FALSE
## 6 a1 FALSE
## 7 ab2 FALSE
## 8 abc3 FALSE
## 9 abcd4 FALSE
# Pattern ".?\\.txt$": any string whose last four characters are ".txt".
# The leading ".?" is optional, so only the "\\.txt$" part constrains the match.
# Note: the variable is called `c`; calls to c(...) still resolve to the base
# function because R skips non-function objects when looking up a call.
c <- c(
  "abc.pdf", "123.png", "a1b2c3.jpeg",
  "abc.txt", "a1b2c3.txt", "1_2_3.txt"
)
c.check <- grepl(pattern = ".?\\.txt$", x = c)
data.frame(c = c, c.check = c.check)
## c c.check
## 1 abc.pdf FALSE
## 2 123.png FALSE
## 3 a1b2c3.jpeg FALSE
## 4 abc.txt TRUE
## 5 a1b2c3.txt TRUE
## 6 1_2_3.txt TRUE
# Pattern "\\d{2}/\\d{2}/\\d{4}": two digits, a slash, two digits, a slash,
# then four digits (a dd/mm/yyyy-style date). Other separators or missing
# slashes do not match.
d <- c(
  "12341234", "12/12/1234", "ab/c2/de23",
  "00000000", "00/00/0000", "00-00-0000"
)
d.check <- grepl(pattern = "\\d{2}/\\d{2}/\\d{4}", x = d)
data.frame(d = d, d.check = d.check)
## d d.check
## 1 12341234 FALSE
## 2 12/12/1234 TRUE
## 3 ab/c2/de23 FALSE
## 4 00000000 FALSE
## 5 00/00/0000 TRUE
## 6 00-00-0000 FALSE
# Pattern "<(.+?)>.+?</\\1>": an opening tag "<name>", then at least one
# character of content, then a closing tag "</name>". The back-reference \\1
# requires the closing tag to repeat exactly the text captured inside the
# opening tag.
# In other words it matches HTML-like elements with a matching pair of tags
# and non-empty content: mismatched tags ("<head>...</toe>") and empty
# elements ("<p></p>") are rejected.
e <- c(
  "<head>hello</toe>",
  "<body>world</body>",
  "<h1>heading false </h2>",
  "<p></p>",
  "<br> example first </br> second</br>"
)
e.check <- grepl(pattern = "<(.+?)>.+?</\\1>", x = e)
data.frame(e = e, e.check = e.check)
## e e.check
## 1 <head>hello</toe> FALSE
## 2 <body>world</body> TRUE
## 3 <h1>heading false </h2> FALSE
## 4 <p></p> FALSE
## 5 <br> example first </br> second</br> TRUE
# Hidden-message puzzle: the message is spelled by the uppercase letters of the
# scrambled string, with periods acting as word separators.
p9 <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"
# Exploration steps kept for reference:
# str_extract_all(p9, "[[:upper:]]")   # uppercase letters only
# str_extract_all(p9, "[[:upper:].]")  # uppercase letters plus periods
# Keep only uppercase letters and periods, glued back into one string.
p9 <- paste0(str_extract_all(p9, "[[:upper:].]")[[1]], collapse = "")
# Follow every literal period with a space so the words are separated.
p9 <- gsub(".", ". ", p9, fixed = TRUE)
p9
## [1] "CONGRATULATIONS. YOU. ARE. A. SUPERNERD"