Load the necessary libraries.
library(stringr)
Copy the introductory example. The vector `name` stores the extracted names.
# Raw text from the introductory example: names interleaved with phone numbers.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson,Homer5553642Dr. Julius Hibbert"
# A name is any run of two or more letters, periods, commas or spaces.
# The input is a single string, so the result list has exactly one element.
name <- str_extract_all(
  string = raw.data,
  pattern = "[[:alpha:]., ]{2,}"
)[[1]]
name
## [1] "Moe Szyslak" "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" "Simpson,Homer" "Dr. Julius Hibbert"
# Drop middle initials: one whitespace character, a single word character,
# and a period (for example " C.").
name_1 <- str_remove_all(name, "\\s\\w{1}\\.")
name_1
## [1] "Moe Szyslak" "Burns, Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" "Simpson,Homer" "Dr. Julius Hibbert"
# Strip a leading title such as "Rev." or "Dr.": two or more word characters
# followed by a period at the start of the string.
#  - Anchoring with ^ keeps abbreviations elsewhere in the name intact.
#  - Dropping the upper bound of 3 prevents longer titles (e.g. "Prof.")
#    from being removed only partially.
#  - The trailing \\s* consumes the space that would otherwise be left in
#    front of the first name.
name_2 <- str_replace_all(name_1, "^\\w{2,}\\.\\s*", "")
name_2
## [1] "Moe Szyslak" "Burns, Montgomery" "Timothy Lovejoy"
## [4] "Ned Flanders" "Simpson,Homer" "Julius Hibbert"
# Reorder "Last, First" into "First Last". Whitespace after the comma is
# optional (\\s*) so that "Simpson,Homer" is reordered as well.
name_3 <- str_replace_all(name_2, "(\\w+),\\s*(\\w+)", "\\2 \\1")
name_3
## [1] "Moe Szyslak" "Montgomery Burns" "Timothy Lovejoy"
## [4] "Ned Flanders" "Homer Simpson" "Julius Hibbert"
# Flag names that start with a title, i.e. a run of word characters
# immediately followed by a period at the beginning of the string.
titlePresent <- str_detect(string = name, pattern = "^\\w+\\.")
# Pair each name with its flag for display.
titleResult <- data.frame(name = name, titlePresent = titlePresent)
titleResult
name | titlePresent |
---|---|
Moe Szyslak | FALSE |
Burns, C. Montgomery | FALSE |
Rev. Timothy Lovejoy | TRUE |
Ned Flanders | FALSE |
Simpson,Homer | FALSE |
Dr. Julius Hibbert | TRUE |
# Flag names that contain a second name given as an initial: whitespace,
# a single word character, and a period (for example " C.").
secNamePresent <- str_detect(string = name, pattern = "\\s\\w{1}\\.")
# Pair each name with its flag for display.
secNameResult <- data.frame(name = name, secNamePresent = secNamePresent)
secNameResult
name | secNamePresent |
---|---|
Moe Szyslak | FALSE |
Burns, C. Montgomery | TRUE |
Rev. Timothy Lovejoy | FALSE |
Ned Flanders | FALSE |
Simpson,Homer | FALSE |
Dr. Julius Hibbert | FALSE |
Describe the types of strings that conform to the following regular expressions and construct an example that is matched by the regular expression.
This expression matches one or more digits followed by a literal dollar sign. Because the pattern is not anchored, the match may occur anywhere in the string.
# Two digit strings ending in "$" and one letters-only string ending in "$".
sampleA <- c("12345$", "87564$","ahdghd$")
# One or more digits followed by an escaped (literal) dollar sign.
patternA <- "[0-9]+\\$"
str_detect(string = sampleA, pattern = patternA)
## [1] TRUE TRUE FALSE
This expression matches a whole word consisting of one to four lowercase letters.
# A digit string, a three-letter word, and a five-letter word.
sampleB <- c("12345$", "abc","ahdgh")
# One to four lowercase letters enclosed by word boundaries.
patternB <- "\\b[a-z]{1,4}\\b"
str_detect(string = sampleB, pattern = patternB)
## [1] FALSE TRUE FALSE
This expression matches any string that ends with “.txt”, such as the name of a text file.
# One .txt file name, one .xls file name, and one string with no extension.
sampleC <- c("test123.txt", "test.xls","ahdgh")
# Any characters (lazily), then a literal ".txt" at the end of the string.
patternC <- ".*?\\.txt$"
str_detect(string = sampleC, pattern = patternC)
## [1] TRUE FALSE FALSE
This expression matches groups of 2, 2 and 4 digits separated by slash (“/”) characters. This format normally represents a date string in either MM/DD/YYYY or DD/MM/YYYY form.
# Two date-like strings and one string without digits.
sampleD <- c("01/20/2018", "28/02/2018","ahdgh")
# Two digits, a slash, two digits, a slash, four digits.
patternD <- "\\d{2}/\\d{2}/\\d{4}"
str_detect(string = sampleD, pattern = patternD)
## [1] TRUE TRUE FALSE
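This expression matches an HTML or XML element in the format `<tag>content</tag>`: an opening tag, at least one character of content, and a closing tag whose name repeats the opening tag (via the backreference `\1`).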
# One tagged element and two strings without tags.
sampleE <- c("<a>This is a tag</a>", "28/02/2018","ahdgh")
# Opening tag (name captured in group 1), some content, then a closing tag
# that must repeat the captured name through the backreference \\1.
patternE <- "<(.+?)>.+?</\\1>"
str_detect(string = sampleE, pattern = patternE)
## [1] TRUE FALSE FALSE
The following code hides a secret message. Crack it with R and regular expressions. Hint: Some of the characters are more revealing than others! The code snippet is also available in the materials at www.r-datacollection.com.
clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0Tanwo Uwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigO d6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5 fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr
# Encoded message from the exercise, stored as one string. The literal spans
# three source lines, so the value contains embedded newline characters.
cypherText <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigO
d6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5
fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"
Step 1. Remove all digits, lowercase letters and whitespace (space/newline) characters by replacing them with an empty string.
# Delete every run of digits, lowercase letters and whitespace so that only
# the uppercase letters and punctuation of the hidden message remain.
# The POSIX classes are listed side by side inside the brackets: a "|" there
# is a literal pipe character, not alternation, and would also be deleted
# from the text.
text1 <- str_replace_all(cypherText, "[[:digit:][:lower:][:space:]]+", "")
text1
## [1] "CONGRATULATIONS.YOU.ARE.A.SUPERNERD!"
Step 2. Replace each dot (.) with a space.
# Turn each literal period into a space to separate the words of the message.
text2 <- str_replace_all(
  string = text1,
  pattern = "\\.",
  replacement = " "
)
text2
## [1] "CONGRATULATIONS YOU ARE A SUPERNERD!"