1 Prerequisites: Available Libraries

2 Problems

2.1 Problem 3.

Copy the introductory example. The vector name stores the extracted names.

# Raw phone-book text: phone numbers and names run together in one string.
# The literal is split per record for readability; paste0 rejoins the pieces
# without separators.
raw.data <- paste0(
  "555-1239Moe Szyslak",
  "(636) 555-0113Burns, C. Montgomery",
  "555-6542Rev. Timothy Lovejoy",
  "555 8904Ned Flanders",
  "636-555-3226Simpson, Homer",
  "5553642Dr. Julius Hibbert"
)
print(raw.data)

## [1] "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"

# Report the storage type of raw.data.
print(typeof(raw.data))

## [1] "character"

Use the tools of this chapter to rearrange the vector so that all elements conform to the standard first_name last_name.

## [1] "Moe Szyslak"          "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders"         "Simpson, Homer"       "Dr. Julius Hibbert"

## [1] "character"

##      [,1]                   [,2]           
## [1,] "Moe Szyslak"          ""             
## [2,] "Burns"                "C. Montgomery"
## [3,] "Rev. Timothy Lovejoy" ""             
## [4,] "Ned Flanders"         ""             
## [5,] "Simpson"              "Homer"        
## [6,] "Dr. Julius Hibbert"   ""

## [1] "character"

## [1] " Moe Szyslak"          "C. Montgomery Burns"   " Rev. Timothy Lovejoy"
## [4] " Ned Flanders"         "Homer Simpson"         " Dr. Julius Hibbert"

## [1] "character"

## [1] " Moe Szyslak"     "C Burns"          " Timothy Lovejoy"
## [4] " Ned Flanders"    "Homer Simpson"    " Julius Hibbert"

# Render the original and the rearranged names as striped HTML tables, with
# the first column and the first row emphasised.
# NOTE(review): originalNames and newNames are defined outside this excerpt;
# presumably one-column data frames - confirm.
# TRUE/FALSE are spelled out because T and F are ordinary variables that can
# be reassigned.
library(kableExtra)
kable(list(originalNames, newNames), "html", escape = FALSE) %>%
  kable_styling("striped", full_width = TRUE) %>%
  column_spec(1, bold = TRUE) %>%
  row_spec(1, bold = TRUE, italic = TRUE, underline = TRUE)

original_names
Moe Szyslak
Burns, C. Montgomery
Rev. Timothy Lovejoy
Ned Flanders
Simpson, Homer
Dr. Julius Hibbert

new_names
Moe Szyslak
C Burns
Timothy Lovejoy
Ned Flanders
Homer Simpson
Julius Hibbert

# Combine originalNames and newNames into one data frame and print it as a
# Markdown table.
kable(
  x = data.frame(originalNames, newNames),
  format = "markdown"
)

original_names	new_names
Moe Szyslak	Moe Szyslak
Burns, C. Montgomery	C Burns
Rev. Timothy Lovejoy	Timothy Lovejoy
Ned Flanders	Ned Flanders
Simpson, Homer	Homer Simpson
Dr. Julius Hibbert	Julius Hibbert

# Interactive table of the original and the rearranged names.
# DataTables option names are case-sensitive: the page size option is
# `pageLength`; a misspelled `pagelength` is not recognised.
library(DT)
DT::datatable(
  data.frame(original_names, new_names),
  options = list(pageLength = 10)
)

Construct a logical vector indicating whether a character has a title (i.e., Rev. and Dr.).

# Flag names that carry a title: two or three letters followed by ". "
# (e.g. "Dr. ", "Rev. ").
# The class is written [A-Za-z] because the range [A-z] also covers the
# punctuation characters that sit between "Z" and "a" ([ \ ] ^ _ `).
has_title <- str_detect(first_last_name, "[A-Za-z]{2,3}\\. ")
# Pair each name with its flag and print the result.
title_check <- data.frame(first_last_name, has_title)
title_check

##         first_last_name has_title
## 1           Moe Szyslak     FALSE
## 2   C. Montgomery Burns     FALSE
## 3  Rev. Timothy Lovejoy      TRUE
## 4          Ned Flanders     FALSE
## 5         Homer Simpson     FALSE
## 6    Dr. Julius Hibbert      TRUE

Construct a logical vector indicating whether a character has a second name.

# Flag names containing a single-letter initial surrounded by spaces and
# followed by a full stop (such as " C. "), used here as the marker of a
# second name.
# [A-Za-z] replaces [A-z], which would also accept [ \ ] ^ _ ` ; the {1}
# quantifier was redundant.
# NOTE(review): detection runs on original_names while the table displays
# first_last_name; this assumes both vectors list the characters in the same
# order - confirm.
has_2nd_name <- str_detect(original_names, " [A-Za-z]\\. ")
# Pair each displayed name with its flag and print the result.
Second_name_check <- data.frame(first_last_name, has_2nd_name)
Second_name_check

##         first_last_name has_2nd_name
## 1           Moe Szyslak        FALSE
## 2   C. Montgomery Burns         TRUE
## 3  Rev. Timothy Lovejoy        FALSE
## 4          Ned Flanders        FALSE
## 5         Homer Simpson        FALSE
## 6    Dr. Julius Hibbert        FALSE

2.2 Problem 4.

Describe the types of strings that conform to the following regular expressions and construct an example that is matched by the regular expression.

2.2.1 (a) [0-9]+\$

Matches one or more digits (0-9) immediately followed by a literal dollar sign, e.g. "50$".

# Sample strings with digits and "$" in various positions.
inputList <- c(
  "$100", "50$", "1$30c", "$t@1!$", "1010NYStreet",
  "1080px$10", "ha100$pp$y", "hello worlds$", "0123456789$"
)
# Collect every match across all inputs into one flat character vector.
outputList <- unlist(
  str_extract_all(string = inputList, pattern = "[0-9]+\\$")
)
# Show the inputs and the extracted matches.
list(data.frame(inputList), data.frame(outputList))

## [[1]]
##       inputList
## 1          $100
## 2           50$
## 3         1$30c
## 4        $t@1!$
## 5  1010NYStreet
## 6     1080px$10
## 7    ha100$pp$y
## 8 hello worlds$
## 9   0123456789$
## 
## [[2]]
##    outputList
## 1         50$
## 2          1$
## 3        100$
## 4 0123456789$

# Disabled alternative renderings, kept for reference:
# DT::datatable(data.frame(inputList, outputList), options = list(pageLength = 10))
# kable(list(inputList, outputList), "html", escape = FALSE) %>%
#   kable_styling("striped", full_width = TRUE) %>%
#   column_spec(1, bold = TRUE) %>%
#   row_spec(1, bold = TRUE, italic = TRUE, underline = TRUE)
# Bind the inputs and the matches as two rows of one data frame.
plyr::ldply(.data = list(inputList, outputList), .fun = rbind)

##      1   2     3           4            5         6          7
## 1 $100 50$ 1$30c      $t@1!$ 1010NYStreet 1080px$10 ha100$pp$y
## 2  50$  1$  100$ 0123456789$         <NA>      <NA>       <NA>
##               8           9
## 1 hello worlds$ 0123456789$
## 2          <NA>        <NA>

2.2.2 (b) \b[a-z]{1,4}\b

Matches runs of one to four lowercase letters (a-z) delimited by word boundaries, e.g. "five". Because "$" is not a word character, "ix" in "$ix" also matches.

# Sample sentences mixing short and long words, digits, symbols and capitals.
inputList <- c(
  "one two three f0ur five $ix $even eight nine ten",
  "Good Night",
  "The quick brown fox jumps over the lazy dog"
)
# Collect every match across all inputs into one flat character vector.
outputList <- unlist(
  str_extract_all(string = inputList, pattern = "\\b[a-z]{1,4}\\b")
)
# Show the inputs and the extracted matches.
list(data.frame(inputList), data.frame(outputList))

## [[1]]
##                                          inputList
## 1 one two three f0ur five $ix $even eight nine ten
## 2                                       Good Night
## 3      The quick brown fox jumps over the lazy dog
## 
## [[2]]
##    outputList
## 1         one
## 2         two
## 3        five
## 4          ix
## 5        even
## 6        nine
## 7         ten
## 8         fox
## 9        over
## 10        the
## 11       lazy
## 12        dog

# Disabled alternative renderings, kept for reference:
# DT::datatable(data.frame(inputList, outputList), options = list(pageLength = 10))
# kable(list(inputList, outputList), "html", escape = FALSE) %>%
#   kable_styling("striped", full_width = TRUE) %>%
#   column_spec(1, bold = FALSE) %>%
#   row_spec(1, bold = FALSE, italic = FALSE, underline = FALSE)
# Bind the inputs and the matches as two rows of one data frame.
plyr::ldply(.data = list(inputList, outputList), .fun = rbind)

##                                                  1          2
## 1 one two three f0ur five $ix $even eight nine ten Good Night
## 2                                              one        two
##                                             3    4    5    6    7    8
## 1 The quick brown fox jumps over the lazy dog <NA> <NA> <NA> <NA> <NA>
## 2                                        five   ix even nine  ten  fox
##      9   10   11   12
## 1 <NA> <NA> <NA> <NA>
## 2 over  the lazy  dog

2.2.3 (c) .*?\.txt$

Matches any string that ends in ".txt". The match is case-sensitive, so "FILE.TXT" does not match.

# Sample file names with different extensions and letter case.
inputList <- c(
  "example.txt",
  "filename.xls",
  "abc.def.ghi.jkl.mno.pqr.stu.vwx.yz",
  ".txt",
  "FILE.TXT"
)
# Collect every match across all inputs into one flat character vector.
outputList <- unlist(
  str_extract_all(string = inputList, pattern = ".*?\\.txt$")
)
# Show the inputs and the extracted matches.
list(data.frame(inputList), data.frame(outputList))

## [[1]]
##                            inputList
## 1                        example.txt
## 2                       filename.xls
## 3 abc.def.ghi.jkl.mno.pqr.stu.vwx.yz
## 4                               .txt
## 5                           FILE.TXT
## 
## [[2]]
##    outputList
## 1 example.txt
## 2        .txt

# Disabled alternative renderings, kept for reference:
# DT::datatable(data.frame(inputList, outputList), options = list(pageLength = 10))
# kable(list(inputList, outputList), "html", escape = FALSE) %>%
#   kable_styling("striped", full_width = TRUE) %>%
#   column_spec(1, bold = FALSE) %>%
#   row_spec(1, bold = FALSE, italic = FALSE, underline = FALSE)
# Bind the inputs and the matches as two rows of one data frame.
plyr::ldply(.data = list(inputList, outputList), .fun = rbind)

##             1            2                                  3    4
## 1 example.txt filename.xls abc.def.ghi.jkl.mno.pqr.stu.vwx.yz .txt
## 2 example.txt         .txt                               <NA> <NA>
##          5
## 1 FILE.TXT
## 2     <NA>

2.2.4 (d) \d{2}/\d{2}/\d{4}

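Matches exactly two digits, a "/", two more digits, another "/", and finally four digits - a date-like pattern such as "01/01/2019". The pattern is not anchored, so it also matches inside a longer string (e.g. "02/02/2234" within "4712/02/02/2234").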

# Sample date-like strings, some with the wrong digit counts or letters.
inputList <- c(
  "01/01/2019",
  "1/Jan/2019",
  "31/12/2018",
  "ab/cd/efgh",
  "1/1/2019",
  "4712/02/02/2234"
)
# Collect every match across all inputs into one flat character vector.
outputList <- unlist(
  str_extract_all(string = inputList, pattern = "\\d{2}/\\d{2}/\\d{4}")
)
# Show the inputs and the extracted matches.
list(data.frame(inputList), data.frame(outputList))

## [[1]]
##         inputList
## 1      01/01/2019
## 2      1/Jan/2019
## 3      31/12/2018
## 4      ab/cd/efgh
## 5        1/1/2019
## 6 4712/02/02/2234
## 
## [[2]]
##   outputList
## 1 01/01/2019
## 2 31/12/2018
## 3 02/02/2234

# Disabled alternative renderings, kept for reference:
# DT::datatable(data.frame(inputList, outputList), options = list(pageLength = 10))
# kable(list(inputList, outputList), "html", escape = FALSE) %>%
#   kable_styling("striped", full_width = TRUE) %>%
#   column_spec(1, bold = FALSE) %>%
#   row_spec(1, bold = FALSE, italic = FALSE, underline = FALSE)
# Bind the inputs and the matches as two rows of one data frame.
plyr::ldply(.data = list(inputList, outputList), .fun = rbind)

##            1          2          3          4        5               6
## 1 01/01/2019 1/Jan/2019 31/12/2018 ab/cd/efgh 1/1/2019 4712/02/02/2234
## 2 01/01/2019 31/12/2018 02/02/2234       <NA>     <NA>            <NA>

2.2.5 (e) <(.+?)>.+?</\1>

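Matches an opening tag, at least one character of content, and the corresponding closing tag; the backreference \1 requires the closing tag to repeat the opening tag's name. Useful for HTML and XML.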

# Sample markup snippets, including one whose closing tag is missing.
inputList <- c(
  "<!DOCTYPE html><html><body>Hello World</body></html></html>",
  "<d> Chinese the year of the pig </d>",
  "<h3> This will work</h3>",
  "<h3>This will not<h3>",
  "<a>Hello World!</a>"
)
# Collect every match across all inputs into one flat character vector.
outputList <- unlist(
  str_extract_all(string = inputList, pattern = "<(.+?)>.+?</\\1>")
)
# Show the inputs and the extracted matches.
list(data.frame(inputList), data.frame(outputList))

## [[1]]
##                                                     inputList
## 1 <!DOCTYPE html><html><body>Hello World</body></html></html>
## 2                        <d> Chinese the year of the pig </d>
## 3                                    <h3> This will work</h3>
## 4                                       <h3>This will not<h3>
## 5                                         <a>Hello World!</a>
## 
## [[2]]
##                              outputList
## 1 <html><body>Hello World</body></html>
## 2  <d> Chinese the year of the pig </d>
## 3              <h3> This will work</h3>
## 4                   <a>Hello World!</a>

# Disabled alternative renderings, kept for reference:
# DT::datatable(data.frame(inputList, outputList), options = list(pageLength = 10))
# kable(list(inputList, outputList), "html", escape = FALSE) %>%
#   kable_styling("striped", full_width = TRUE) %>%
#   column_spec(1, bold = FALSE) %>%
#   row_spec(1, bold = FALSE, italic = FALSE, underline = FALSE)
# Bind the inputs and the matches as two rows of one data frame.
plyr::ldply(.data = list(inputList, outputList), .fun = rbind)

##                                                             1
## 1 <!DOCTYPE html><html><body>Hello World</body></html></html>
## 2                       <html><body>Hello World</body></html>
##                                      2                        3
## 1 <d> Chinese the year of the pig </d> <h3> This will work</h3>
## 2 <d> Chinese the year of the pig </d> <h3> This will work</h3>
##                       4                   5
## 1 <h3>This will not<h3> <a>Hello World!</a>
## 2   <a>Hello World!</a>                <NA>

2.3 Problem 9.

The following code hides a secret message. Crack it with R and regular expressions. Hint: Some of the characters are more revealing than others! The code snippet is also available in the materials at www.r-datacollection.com.

clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0Tanwo Uwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigO d6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5 fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr

# The cipher text, written as four chunks and joined without separators.
code <- paste0(
  "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0Tanwo",
  "Uwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigO",
  "d6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5",
  "fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"
)
# Keep only the upper-case letters and full stops, glue them together, then
# turn each full stop into a space. local() keeps the helper variable out of
# the global environment.
secret_message <- local({
  revealing_chars <- unlist(str_extract_all(code, "[[:upper:].]{1,}"))
  str_replace_all(paste(revealing_chars, collapse = ""), "[.]", " ")
})
secret_message

## [1] "CONGRATULATIONS YOU ARE A SUPERNERD"

DATA 607 02 [15961] : Week 3 - R Character Manipulation and Date Processing [02/11 - 02/17]

Debabrata Kabiraj

February 16, 2019