Copy the introductory example. The vector name stores the extracted names.
# Raw input for the exercise: phone numbers and names run together in one
# single character string (assembled here from one piece per record).
raw.data <- paste0(
  "555-1239Moe Szyslak",
  "(636) 555-0113Burns, C. Montgomery",
  "555-6542Rev. Timothy Lovejoy",
  "555 8904Ned Flanders",
  "636-555-3226Simpson, Homer",
  "5553642Dr. Julius Hibbert"
)
raw.data
## [1] "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"
# Check the storage type of raw.data.
typeof(raw.data)
## [1] "character"
## [1] "Moe Szyslak" "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" "Simpson, Homer" "Dr. Julius Hibbert"
## [1] "character"
## [,1] [,2]
## [1,] "Moe Szyslak" ""
## [2,] "Burns" "C. Montgomery"
## [3,] "Rev. Timothy Lovejoy" ""
## [4,] "Ned Flanders" ""
## [5,] "Simpson" "Homer"
## [6,] "Dr. Julius Hibbert" ""
## [1] "character"
## [1] " Moe Szyslak" "C. Montgomery Burns" " Rev. Timothy Lovejoy"
## [4] " Ned Flanders" "Homer Simpson" " Dr. Julius Hibbert"
## [1] "character"
## [1] " Moe Szyslak" "C Burns" " Timothy Lovejoy"
## [4] " Ned Flanders" "Homer Simpson" " Julius Hibbert"
library(kableExtra)
# Render the original and the cleaned-up names as a striped, full-width HTML
# table, with the first column in bold and the first row in bold, italic and
# underlined.
# TRUE/FALSE are spelled out: T and F are ordinary variables that can be
# reassigned, and the call previously mixed both spellings.
kable(list(originalNames, newNames), "html", escape = FALSE) %>%
  kable_styling("striped", full_width = TRUE) %>%
  column_spec(1, bold = TRUE) %>%
  row_spec(1, bold = TRUE, italic = TRUE, underline = TRUE)
|
|
kable(data.frame(originalNames, newNames), format = "markdown")

| original_names | new_names |
|---|---|
| Moe Szyslak | Moe Szyslak |
| Burns, C. Montgomery | C Burns |
| Rev. Timothy Lovejoy | Timothy Lovejoy |
| Ned Flanders | Ned Flanders |
| Simpson, Homer | Homer Simpson |
| Dr. Julius Hibbert | Julius Hibbert |
library(DT)
# Interactive table of the original and the cleaned-up names.
# DataTables option names are case-sensitive; the page-size option is spelled
# `pageLength` (the previous `pagelength` is not a recognised option name).
DT::datatable(
  data.frame(original_names, new_names),
  options = list(pageLength = 10)
)

# Flag names that contain a title: two or three letters followed by a full
# stop and a space (e.g. "Dr. ", "Rev. ").
# The range is written [A-Za-z] because [A-z] also covers the punctuation
# characters that sit between "Z" and "a" ([ \ ] ^ _ `).
has_title <- str_detect(first_last_name, "[A-Za-z]{2,3}\\. ")
title_check <- data.frame(first_last_name, has_title)
title_check
## first_last_name has_title
## 1 Moe Szyslak FALSE
## 2 C. Montgomery Burns FALSE
## 3 Rev. Timothy Lovejoy TRUE
## 4 Ned Flanders FALSE
## 5 Homer Simpson FALSE
## 6 Dr. Julius Hibbert TRUE
# Flag names that contain an abbreviated second name: a single letter followed
# by a full stop, with a space on either side (e.g. " C. ").
# The range is written [A-Za-z] because [A-z] also covers the punctuation
# characters that sit between "Z" and "a"; the redundant {1} is dropped.
# NOTE(review): detection runs on original_names while the table shows
# first_last_name; this presumably relies on both vectors being in the same
# order - confirm.
has_2nd_name <- str_detect(original_names, " [A-Za-z]\\. ")
Second_name_check <- data.frame(first_last_name, has_2nd_name)
Second_name_check
## first_last_name has_2nd_name
## 1 Moe Szyslak FALSE
## 2 C. Montgomery Burns TRUE
## 3 Rev. Timothy Lovejoy FALSE
## 4 Ned Flanders FALSE
## 5 Homer Simpson FALSE
## 6 Dr. Julius Hibbert FALSE
Describe the types of strings that conform to the following regular expressions and construct an example that is matched by the regular expression.
# Matches one or more digits (0-9) immediately followed by a literal dollar
# sign. Digits that come after the "$" (as in "$100") are not matched.
inputList <- c(
  "$100", "50$", "1$30c", "$t@1!$", "1010NYStreet", "1080px$10",
  "ha100$pp$y", "hello worlds$", "0123456789$"
)
# str_extract_all() returns one element per input; unlist() flattens the
# matches into a single character vector.
outputList <- unlist(str_extract_all(inputList, "[0-9]+\\$"))
list(data.frame(inputList), data.frame(outputList))
## [[1]]
## inputList
## 1 $100
## 2 50$
## 3 1$30c
## 4 $t@1!$
## 5 1010NYStreet
## 6 1080px$10
## 7 ha100$pp$y
## 8 hello worlds$
## 9 0123456789$
##
## [[2]]
## outputList
## 1 50$
## 2 1$
## 3 100$
## 4 0123456789$
#DT::datatable(data.frame(inputList,outputList), options = list(pagelength=10))
#kable(list(inputList, outputList), "html", escape = F) %>%
# kable_styling("striped", full_width = T) %>%
# column_spec(1, bold = T) %>%
# row_spec(1, bold = T, italic = T, underline = TRUE)
# Show the inputs (row 1) above the extracted matches (row 2) in one table.
plyr::ldply(.data = list(inputList, outputList), .fun = rbind)
## 1 2 3 4 5 6 7
## 1 $100 50$ 1$30c $t@1!$ 1010NYStreet 1080px$10 ha100$pp$y
## 2 50$ 1$ 100$ 0123456789$ <NA> <NA> <NA>
## 8 9
## 1 hello worlds$ 0123456789$
## 2 <NA> <NA>
# Matches whole words made up of one to four lower-case letters. The \\b
# anchors require a word boundary on both sides, so "f0ur" and "Good" yield
# nothing, while "$ix" yields "ix" because "$" is not a word character.
inputList <- c(
  "one two three f0ur five $ix $even eight nine ten",
  "Good Night",
  "The quick brown fox jumps over the lazy dog"
)
# str_extract_all() returns one element per input; unlist() flattens the
# matches into a single character vector.
outputList <- unlist(str_extract_all(inputList, "\\b[a-z]{1,4}\\b"))
list(data.frame(inputList), data.frame(outputList))
## [[1]]
## inputList
## 1 one two three f0ur five $ix $even eight nine ten
## 2 Good Night
## 3 The quick brown fox jumps over the lazy dog
##
## [[2]]
## outputList
## 1 one
## 2 two
## 3 five
## 4 ix
## 5 even
## 6 nine
## 7 ten
## 8 fox
## 9 over
## 10 the
## 11 lazy
## 12 dog
#DT::datatable(data.frame(inputList,outputList), options = list(pagelength=10))
#kable(list(inputList, outputList), "html", escape = F) %>%
# kable_styling("striped", full_width = T) %>%
# column_spec(1, bold = F) %>%
# row_spec(1, bold = F, italic = F, underline = FALSE)
# Tabulate the input strings (row 1) together with the matched words (row 2).
plyr::ldply(.data = list(inputList, outputList), .fun = rbind)
## 1 2
## 1 one two three f0ur five $ix $even eight nine ten Good Night
## 2 one two
## 3 4 5 6 7 8
## 1 The quick brown fox jumps over the lazy dog <NA> <NA> <NA> <NA> <NA>
## 2 five ix even nine ten fox
## 9 10 11 12
## 1 <NA> <NA> <NA> <NA>
## 2 over the lazy dog
# Matches any string that ends in ".txt". The match is case-sensitive, so
# "FILE.TXT" is not matched; the bare string ".txt" is.
inputList <- c(
  "example.txt", "filename.xls", "abc.def.ghi.jkl.mno.pqr.stu.vwx.yz",
  ".txt", "FILE.TXT"
)
# str_extract_all() returns one element per input; unlist() flattens the
# matches into a single character vector.
outputList <- unlist(str_extract_all(inputList, ".*?\\.txt$"))
list(data.frame(inputList), data.frame(outputList))
## [[1]]
## inputList
## 1 example.txt
## 2 filename.xls
## 3 abc.def.ghi.jkl.mno.pqr.stu.vwx.yz
## 4 .txt
## 5 FILE.TXT
##
## [[2]]
## outputList
## 1 example.txt
## 2 .txt
#DT::datatable(data.frame(inputList,outputList), options = list(pagelength=10))
#kable(list(inputList, outputList), "html", escape = F) %>%
# kable_styling("striped", full_width = T) %>%
# column_spec(1, bold = F) %>%
# row_spec(1, bold = F, italic = F, underline = FALSE)
# Put the candidate file names (row 1) and the ones that matched (row 2) in
# a single table.
plyr::ldply(.data = list(inputList, outputList), .fun = rbind)
## 1 2 3 4
## 1 example.txt filename.xls abc.def.ghi.jkl.mno.pqr.stu.vwx.yz .txt
## 2 example.txt .txt <NA> <NA>
## 5
## 1 FILE.TXT
## 2 <NA>
# Matches exactly two digits, a "/", two more digits, another "/", and then
# four digits (a date-like dd/dd/dddd pattern). The pattern is not anchored,
# so it can also match inside a longer string: "4712/02/02/2234" yields
# "02/02/2234".
inputList <- c(
  "01/01/2019", "1/Jan/2019", "31/12/2018", "ab/cd/efgh", "1/1/2019",
  "4712/02/02/2234"
)
# str_extract_all() returns one element per input; unlist() flattens the
# matches into a single character vector.
outputList <- unlist(str_extract_all(inputList, "\\d{2}/\\d{2}/\\d{4}"))
list(data.frame(inputList), data.frame(outputList))
## [[1]]
## inputList
## 1 01/01/2019
## 2 1/Jan/2019
## 3 31/12/2018
## 4 ab/cd/efgh
## 5 1/1/2019
## 6 4712/02/02/2234
##
## [[2]]
## outputList
## 1 01/01/2019
## 2 31/12/2018
## 3 02/02/2234
#DT::datatable(data.frame(inputList,outputList), options = list(pagelength=10))
#kable(list(inputList, outputList), "html", escape = F) %>%
# kable_styling("striped", full_width = T) %>%
# column_spec(1, bold = F) %>%
# row_spec(1, bold = F, italic = F, underline = FALSE)
# Line up the candidate strings (row 1) with the date-like matches (row 2).
plyr::ldply(.data = list(inputList, outputList), .fun = rbind)
## 1 2 3 4 5 6
## 1 01/01/2019 1/Jan/2019 31/12/2018 ab/cd/efgh 1/1/2019 4712/02/02/2234
## 2 01/01/2019 31/12/2018 02/02/2234 <NA> <NA> <NA>
# Matches an opening tag, at least one character of content, and the closing
# tag with the same name - the kind of element found in HTML and XML. The
# back-reference \\1 forces the closing tag to repeat whatever the first
# group captured, so "<h3>This will not<h3>" (no closing tag) is not matched.
inputList <- c(
  "<!DOCTYPE html><html><body>Hello World</body></html></html>",
  "<d> Chinese the year of the pig </d>",
  "<h3> This will work</h3>",
  "<h3>This will not<h3>",
  "<a>Hello World!</a>"
)
# str_extract_all() returns one element per input; unlist() flattens the
# matches into a single character vector.
outputList <- unlist(str_extract_all(inputList, "<(.+?)>.+?</\\1>"))
list(data.frame(inputList), data.frame(outputList))
## [[1]]
## inputList
## 1 <!DOCTYPE html><html><body>Hello World</body></html></html>
## 2 <d> Chinese the year of the pig </d>
## 3 <h3> This will work</h3>
## 4 <h3>This will not<h3>
## 5 <a>Hello World!</a>
##
## [[2]]
## outputList
## 1 <html><body>Hello World</body></html>
## 2 <d> Chinese the year of the pig </d>
## 3 <h3> This will work</h3>
## 4 <a>Hello World!</a>
#DT::datatable(data.frame(inputList,outputList), options = list(pagelength=10))
#kable(list(inputList, outputList), "html", escape = F) %>%
# kable_styling("striped", full_width = T) %>%
# column_spec(1, bold = F) %>%
# row_spec(1, bold = F, italic = F, underline = FALSE)
# Display the markup inputs (row 1) over the elements that matched (row 2).
plyr::ldply(.data = list(inputList, outputList), .fun = rbind)
## 1
## 1 <!DOCTYPE html><html><body>Hello World</body></html></html>
## 2 <html><body>Hello World</body></html>
## 2 3
## 1 <d> Chinese the year of the pig </d> <h3> This will work</h3>
## 2 <d> Chinese the year of the pig </d> <h3> This will work</h3>
## 4 5
## 1 <h3>This will not<h3> <a>Hello World!</a>
## 2 <a>Hello World!</a> <NA>
The following code hides a secret message. Crack it with R and regular expressions. Hint: Some of the characters are more revealing than others! The code snippet is also available in the materials at www.r-datacollection.com.
clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0Tanwo Uwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigO d6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5 fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr
# The scrambled text, joined into one string without separators.
code <- paste0(
  "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0Tanwo",
  "Uwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigO",
  "d6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5",
  "fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"
)
# Decode step by step: pull out every run of upper-case letters and full
# stops, glue the runs together, then turn each full stop into a space.
secret_message <- code %>%
  str_extract_all("[[:upper:].]{1,}") %>%
  unlist() %>%
  paste(collapse = "") %>%
  str_replace_all("[.]", " ")
secret_message
## [1] "CONGRATULATIONS YOU ARE A SUPERNERD"