This assignment is about regular expressions. To solve problems 3, 4 and 9 from Chapter 8 of Automated Data Collection with R, the following library is used.
library(stringr)
Q: 1 (3.) Copy the introductory example. The vector name stores the extracted names.
R> name [1] “Moe Szyslak” “Burns, C. Montgomery” “Rev. Timothy Lovejoy” [4] “Ned Flanders” “Simpson, Homer” “Dr. Julius Hibbert”
# Raw phone-book string used as the introductory example in Chapter 8 of the book.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson,Homer5553642Dr. Julius Hibbert"
# Pull out every run of two or more letters, periods, commas or spaces.
# str_extract_all() returns a list with one element per input string, so the
# first (and only) element holds the character vector of names.
name <- str_extract_all(raw.data, "[[:alpha:]., ]{2,}")[[1]]
print(name)
## [1] "Moe Szyslak" "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" "Simpson,Homer" "Dr. Julius Hibbert"
A: (a)
# Wrap the extracted names in a data frame with a single column called "name"
names.data <- data.frame(name = name)
print(names.data)
## name
## 1 Moe Szyslak
## 2 Burns, C. Montgomery
## 3 Rev. Timothy Lovejoy
## 4 Ned Flanders
## 5 Simpson,Homer
## 6 Dr. Julius Hibbert
# Drop a middle initial (one letter followed by a period, with a space on each
# side), e.g. "Burns, C. Montgomery" becomes "Burns, Montgomery".
# [A-Za-z] is used instead of [A-z]: the A-z range also covers the punctuation
# characters that sit between "Z" and "a" in ASCII ([ \ ] ^ _ `).
names.data$formated_Data <- sub(" [A-Za-z]\\. ", " ", names.data$name)
names.data
## name formated_Data
## 1 Moe Szyslak Moe Szyslak
## 2 Burns, C. Montgomery Burns, Montgomery
## 3 Rev. Timothy Lovejoy Rev. Timothy Lovejoy
## 4 Ned Flanders Ned Flanders
## 5 Simpson,Homer Simpson,Homer
## 6 Dr. Julius Hibbert Dr. Julius Hibbert
# Convert "last, first" entries to "first last".
# The whitespace after the comma is optional (\\s*), so "Burns, Montgomery" and
# "Simpson,Homer" are both handled by one substitution and become
# "Montgomery Burns" and "Homer Simpson". Names without a comma are unchanged.
names.data$firstName_lastName <- sub("(\\w+),\\s*(\\w+)", "\\2 \\1", names.data$formated_Data)
names.data
## name formated_Data firstName_lastName
## 1 Moe Szyslak Moe Szyslak Moe Szyslak
## 2 Burns, C. Montgomery Burns, Montgomery Montgomery Burns
## 3 Rev. Timothy Lovejoy Rev. Timothy Lovejoy Rev. Timothy Lovejoy
## 4 Ned Flanders Ned Flanders Ned Flanders
## 5 Simpson,Homer Simpson,Homer Homer Simpson
## 6 Dr. Julius Hibbert Dr. Julius Hibbert Dr. Julius Hibbert
(b)
# Find out if the character has a title (i.e., "Rev." or "Dr.").
# A title is taken to be two or more letters followed by a period; requiring at
# least two letters keeps a one-letter initial such as "C." from being flagged.
names.data$hasTitle <- str_detect(names.data$firstName_lastName, "[[:alpha:]]{2,}\\.")
names.data
## name formated_Data firstName_lastName hasTitle
## 1 Moe Szyslak Moe Szyslak Moe Szyslak FALSE
## 2 Burns, C. Montgomery Burns, Montgomery Montgomery Burns FALSE
## 3 Rev. Timothy Lovejoy Rev. Timothy Lovejoy Rev. Timothy Lovejoy TRUE
## 4 Ned Flanders Ned Flanders Ned Flanders FALSE
## 5 Simpson,Homer Simpson,Homer Homer Simpson FALSE
## 6 Dr. Julius Hibbert Dr. Julius Hibbert Dr. Julius Hibbert TRUE
(c)
# Find out if the character has a second name, e.g. "Burns, C. Montgomery".
# A second name appears as an initial: a single capital letter at a word
# boundary, directly followed by a period. Titles such as "Rev." or "Dr." do not
# match because more than one letter precedes their period.
names.data$hasSecondName <- str_detect(names.data$name, "\\b[A-Z]\\.")
names.data
## name formated_Data firstName_lastName hasTitle
## 1 Moe Szyslak Moe Szyslak Moe Szyslak FALSE
## 2 Burns, C. Montgomery Burns, Montgomery Montgomery Burns FALSE
## 3 Rev. Timothy Lovejoy Rev. Timothy Lovejoy Rev. Timothy Lovejoy TRUE
## 4 Ned Flanders Ned Flanders Ned Flanders FALSE
## 5 Simpson,Homer Simpson,Homer Homer Simpson FALSE
## 6 Dr. Julius Hibbert Dr. Julius Hibbert Dr. Julius Hibbert TRUE
## hasSecondName
## 1 FALSE
## 2 TRUE
## 3 FALSE
## 4 FALSE
## 5 FALSE
## 6 FALSE
Q: 2 (4.) Describe the types of strings that conform to the following regular expressions and construct an example that is matched by the regular expression.
[0-9]+\\$
\\b[a-z]{1,4}\\b
.*?\\.txt$
\\d{2}/\\d{2}/\\d{4}
<(.+?)>.+?</\\1>
A: (a)
# Matches one or more digits (0-9) immediately followed by a literal "$" sign.
pattern <- "[0-9]+\\$"
num.data <- c(
  "1357.75$", "ACEG$", "$1234", "3421$",
  "2590$", "9867.35$", "1$", "123456$"
)
str_match(string = num.data, pattern = pattern)
## [,1]
## [1,] "75$"
## [2,] NA
## [3,] NA
## [4,] "3421$"
## [5,] "2590$"
## [6,] "35$"
## [7,] "1$"
## [8,] "123456$"
#[1,] "75$" -- Value is selected because last 3 characters match the pattern. Data before decimal does not fit the pattern.
#[2,] NA -- Even though string ends with $ sign, it does not fit the numeric pattern.
#[3,] NA -- Dollar sign exists at the begin of the string.
#[4,] "3421$" -- Fits the pattern exactly
#[5,] "2590$" -- Fits the pattern exactly
#[6,] "35$" -- Value is selected because last 3 characters match the pattern. Data before decimal does not fit the pattern.
#[7,] "1$" -- Fits the pattern exactly
#[8,] "123456$" -- Fits the pattern exactly
(b)
# Matches a whole word that consists of 1 to 4 lowercase letters [a-z] and nothing else.
# "\\b" marks a word boundary, so the letters must be delimited on both sides.
pattern <- "\\b[a-z]{1,4}\\b"
string.data <- c(
  "bats", "Bats", "BATS", "bambi", "boys", "dogs",
  "cats", "news", "Note books", "bye", "by2020"
)
str_match(string = string.data, pattern = pattern)
## [,1]
## [1,] "bats"
## [2,] NA
## [3,] NA
## [4,] NA
## [5,] "boys"
## [6,] "dogs"
## [7,] "cats"
## [8,] "news"
## [9,] NA
## [10,] "bye"
## [11,] NA
#[1,] "bats" -- Matches exact pattern 4 letter word.
#[2,] NA -- Even though it is a 4 letter word, it starts with uppercase letter
#[3,] NA -- Even though it is a 4 letter word, it is written entirely in uppercase letters
#[4,] NA -- This is 5 letter word, does not match the pattern.
#[5,] "boys" -- Matches exact pattern 4 letter word.
#[6,] "dogs" -- Matches exact pattern 4 letter word.
#[7,] "cats" -- Matches exact pattern 4 letter word.
#[8,] "news" -- Matches exact pattern 4 letter word.
#[9,] NA -- Does not match the pattern. "Note" starts with an uppercase letter and "books" is a 5 letter word.
#[10,] "bye" -- Matches exact pattern 3 letter word.
#[11,] NA -- Has numerics in the word.
(c)
# Matches file names ending in ".txt": any characters (".*?", possibly none)
# followed by a literal ".txt". "$" anchors the match to the end of the string.
pattern <- ".*?\\.txt$"
filenames.data <- c(
  "one.txt", "2Notepad.txt", "3-4/56.txt", "A B^NewFile.txt",
  "JustSaved.txtdoc", "oldfile.txt$", "Very Old.xls", "Sample.txt"
)
str_match(string = filenames.data, pattern = pattern)
## [,1]
## [1,] "one.txt"
## [2,] "2Notepad.txt"
## [3,] "3-4/56.txt"
## [4,] "A B^NewFile.txt"
## [5,] NA
## [6,] NA
## [7,] NA
## [8,] "Sample.txt"
#[1,] "one.txt" -- Matches exact pattern has data before .txt and file name ends with txt.
#[2,] "2Notepad.txt" -- Matches exact pattern has data before .txt and file name ends with txt.
#[3,] "3-4/56.txt" -- Matches exact pattern has data before .txt and file name ends with txt.
#[4,] "A B^NewFile.txt" -- Matches exact pattern has data before .txt and file name ends with txt.
#[5,] NA -- Even though .txt exists in the file name, file name does not end with .txt, hence no match.
#[6,] NA -- Even though .txt exists in the file name, file name does not end with .txt, hence no match.
#[7,] NA -- File name does not end with .txt, hence no match.
#[8,] "Sample.txt" -- Matches exact pattern has data before .txt and file name ends with txt.
(d)
# Matches date-like strings: 2 digits, "/", 2 digits, "/", then 4 digits.
# Any digit [0-9] is accepted, so the match does not have to be a valid date.
pattern <- "\\d{2}/\\d{2}/\\d{4}"
date.data <- c(
  "01/01/2017", "01.01.2016", "21/21/2017", "Jan 01, 2017", "1/01/2017",
  "01/1/2017", "01/01/17", "00/00/0000", "2017/01/23"
)
str_match(string = date.data, pattern = pattern)
## [,1]
## [1,] "01/01/2017"
## [2,] NA
## [3,] "21/21/2017"
## [4,] NA
## [5,] NA
## [6,] NA
## [7,] NA
## [8,] "00/00/0000"
## [9,] NA
#[1,] "01/01/2017" -- Matches exact pattern 2 digit number followed "/" 2 digits "/" and 4 digits.
#[2,] NA -- Does not match the pattern.
#[3,] "21/21/2017" -- Matches the pattern, even though it is not valid date.
#[4,] NA -- Does not match the pattern
#[5,] NA -- Does not match the pattern
#[6,] NA -- Does not match the pattern, even though it is valid date.
#[7,] NA -- Does not match the pattern, even though it is valid date.
#[8,] "00/00/0000" -- Matches the pattern, even though it is not valid date.
#[9,] NA -- Does not match the pattern.
(e)
# Matches an HTML-style element: an opening tag "<(.+?)>", at least one
# character of content (".+?"), and a closing tag "</\\1>". The backreference
# "\\1" must repeat exactly the text captured by "(.+?)", so the closing tag
# has to carry the same name as the opening tag.
pattern <- "<(.+?)>.+?</\\1>"
htmltag.data <- c(
  "<tr><td>Info</td></tr>",
  "<b><i>Info</i></bad>",
  "<c><i>Info</c></i>",
  "</b>"
)
str_match(string = htmltag.data, pattern = pattern)
## [,1] [,2]
## [1,] "<tr><td>Info</td></tr>" "tr"
## [2,] "<i>Info</i>" "i"
## [3,] "<c><i>Info</c>" "c"
## [4,] NA NA
# [,1] [,2]
#[1,] "<tr><td>Info</td></tr>" "tr" -- Matches the pattern exactly. First info collected is from <tr>.
#[2,] "<i>Info</i>" "i" -- Matches partially, <b> and </bad> tags are eliminated. and tag <i> is used.
#[3,] "<c><i>Info</c>" "c" -- Matches partially, <c> is mapped to </c> and last tag </i> is eliminated. As first info collected is from <c>.
#[4,] NA NA -- Does not match the pattern.
Q 3 (9.) The following code hides a secret message. Crack it with R and regular expressions. Hint: Some of the characters are more revealing than others! The code snippet is also available in the materials at www.r-datacollection.com.
# Encoded text from the exercise; the hidden message is decoded further below.
secret.msg <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"
print(secret.msg)
## [1] "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"
# Extract all lower case letters.
# A bare character class is enough: a trailing lazy ".*?" always matches the
# empty string, so it adds nothing to the result.
pattern <- "[a-z]"
str_extract_all(secret.msg, pattern)
## [[1]]
## [1] "c" "l" "c" "o" "p" "o" "w" "z" "m" "s" "t" "c" "d" "w" "n" "k" "i"
## [18] "g" "v" "d" "i" "c" "p" "u" "g" "g" "v" "h" "r" "y" "n" "j" "u" "w"
## [35] "c" "z" "i" "h" "q" "r" "f" "p" "x" "s" "j" "d" "w" "p" "n" "a" "n"
## [52] "w" "o" "w" "i" "s" "d" "i" "j" "j" "k" "p" "f" "d" "r" "c" "o" "c"
## [69] "b" "t" "y" "c" "z" "j" "a" "t" "a" "o" "o" "t" "j" "t" "j" "n" "e"
## [86] "c" "f" "e" "k" "r" "w" "w" "w" "o" "j" "i" "g" "d" "v" "r" "f" "r"
## [103] "b" "z" "b" "k" "n" "b" "h" "z" "g" "v" "i" "z" "c" "r" "o" "p" "w"
## [120] "g" "n" "b" "q" "o" "f" "a" "o" "t" "f" "b" "w" "m" "k" "t" "s" "z"
## [137] "q" "e" "f" "y" "n" "d" "t" "k" "c" "f" "g" "m" "c" "g" "x" "o" "n"
## [154] "h" "k" "g" "r"
# Extract all upper case letters together with the punctuation marks: the
# periods in the code separate the words and the "!" ends the message, so
# keeping them lets the message be assembled in code instead of by hand.
pattern <- "[[:upper:][:punct:]]"
secret.chars <- unlist(str_extract_all(secret.msg, pattern))
# Join the single characters and turn the period separators into spaces.
message.text <- str_replace_all(str_c(secret.chars, collapse = ""), fixed("."), " ")
message.text
## [1] "CONGRATULATIONS YOU ARE A SUPERNERD!"
#Message: CONGRATULATIONS YOU ARE A SUPERNERD!
References: http://www.stackoverflow.com/questions/33826650/last-name-first-name-to-first-name-last-name