Question

Please deliver links to an R Markdown file (in GitHub and rpubs.com) with solutions to problems 3 and 4 from chapter 8 of Automated Data Collection in R. Problem 9 is extra credit. You may work in a small group, but please submit separately with names of all group participants in your submission.

Here is the referenced code for the introductory example in #3:

Working

raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"

Question 1

  1. Copy the introductory example. The vector name stores the extracted names. R> name [1] “Moe Szyslak” “Burns, C. Montgomery” “Rev. Timothy Lovejoy” [4] “Ned Flanders” “Simpson, Homer” “Dr. Julius Hibbert”
# WORKING 


# ---- Display names ----
# Raw phone-book text: names and phone numbers run together in one string.
rawdata <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"

# A name is a run of at least two letters, periods, commas or spaces.
# str_extract_all() returns a list (one entry per input string), so the
# single entry is flattened into a plain character vector.
name_runs <- str_extract_all(rawdata, "[[:alpha:]., ]{2,}")
names <- unlist(name_runs)

names
## [1] "Moe Szyslak"          "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders"         "Simpson, Homer"       "Dr. Julius Hibbert"

Question 2

  1. Use the tools of this chapter to rearrange the vector so that all elements conform to the standard first_name last_name.
  • Moe Szyslak
  • Burns C. Montgomery
  • Ned Flanders
  • Simpson, Homer
  • Dr. Julius Hibbert
#------------------- Display first and last name --------------------

#-------------------------
# Target to achieve
#First Name  || Last Name |
#-------------------------
# Moe           Szyslak         
# Burns         C. Montgomery 
# 
# Ned           Flanders      
# Simpson,      Homer
# Dr. Julius    Hibbert
#-------------------------

# ---- grepall(): positions of the elements that match ALL patterns ----
#
# Args:
#   pattern: character vector of regular expressions (or fixed strings when
#     `fixed = TRUE`). An element of `x` is selected only if it matches every
#     one of them.
#   x: character vector to search.
#   ignore.case, perl, fixed, useBytes: forwarded unchanged to grepl().
#   value: if TRUE, return the matching elements of `x` instead of positions.
#   logic: if TRUE, return a logical vector as long as `x`; this takes
#     precedence over `value`.
#
# Returns:
#   NULL (with a warning) when `pattern` or `x` is empty. Otherwise a logical
#   vector, the integer positions of the matches, or the matching elements,
#   depending on `logic` and `value`.
grepall <- function(pattern, x,
                    ignore.case = FALSE, perl = FALSE,
                    fixed = FALSE, useBytes = FALSE,
                    value = FALSE, logic = FALSE) {
  # Nothing to match with or against. `||` because this is a scalar test.
  if (length(pattern) == 0 || length(x) == 0) {
    warning("Length of pattern or data equals zero.")
    return(NULL)
  }

  # One logical vector per pattern, each as long as `x`. Keeping them in a
  # list (rather than letting sapply() simplify) also works when `x` has a
  # single element, where sapply() would return a plain vector and a row-wise
  # apply() would fail.
  hits <- lapply(pattern, function(p) {
    grepl(p, x,
          ignore.case = ignore.case, perl = perl,
          fixed = fixed, useBytes = useBytes)
  })

  # An element is kept only if every pattern matched it.
  index <- Reduce(`&`, hits)

  if (logic) {
    return(index)
  }
  if (value) {
    return(x[index])
  }
  which(index)
}

#------End of FM 

# Title of each character ("Dr." / "Rev."), NA when there is none.
# One slot per name is allocated up front: growing the vector from a single NA
# by indexed assignment would leave it shorter than `names` whenever the last
# name carries no title.
Title <- rep(NA_character_, length(names))
# The period is escaped so only a literal "Dr." / "Rev." counts as a title
# (an unescaped "." matches any character).
Title[grepall("Dr\\.", names)] <- "Dr."
Title[grepall("Rev\\.", names)] <- "Rev."

# Names with the title removed and surrounding whitespace trimmed.
tName <- trimws(str_replace_all(names, "Rev\\.|Dr\\.", ""))

# Split every title-free name into its pieces. `simplify = TRUE` (spelled out
# instead of a bare positional 1) returns a character matrix with one row per
# name; for this data each name yields two pieces, which become the two
# columns named below.
FLName <- trimws(str_extract_all(
  tName,
  "[[:alpha:]]{2,}|[[:alpha:]. ?]{2,}|[.?[:alpha:] ]{2,}",
  simplify = TRUE
))
colnames(FLName) <- c("First Name", "Last Name")
datatable(data.frame(Title, "Full Name" = tName, FLName))

Question 3

  1. Construct a logical vector indicating whether a character has a title (i.e., Rev. and Dr.).
# ---- Has title (Rev. or Dr.) ----
# Option 1: look for the two known titles explicitly. The periods are escaped
# so they match a literal "." rather than any character.
data.frame(names, Title, "Has_Title" = grepl("Dr\\.|Rev\\.", names))
# Option 2: treat any word of one capital letter plus one or two lowercase
# letters, followed by a period, as a title.
data.frame(names, Title,
           "Has_Title" = str_detect(names, "\\b[A-Z]{1}[a-z]{1,2}\\."))

Question 4

  1. Construct a logical vector indicating whether a character has a second name.
# ---- Has second name ----
# With the title already stripped (tName), a name has a second name when it
# still contains more than two alphabetic words, e.g. "Burns, C. Montgomery".
# The comparison itself is the logical vector the question asks for, so it is
# kept as TRUE/FALSE instead of being converted to the strings "TRUE"/"FALSE".
data.frame("Raw Name " = names, "Full_Name" = tName,
           "Sec_Name" = str_count(tName, "[[:alpha:]]+") > 2)
#--------------------------------------------------------------------------------------
  • Some other Test
# Runs of one to three letters: longer words get chopped into pieces of at
# most three letters.
unlist(str_extract_all(string = names, pattern = "[[:alpha:]]{1,3}"),
       recursive = TRUE)
##  [1] "Moe" "Szy" "sla" "k"   "Bur" "ns"  "C"   "Mon" "tgo" "mer" "y"  
## [12] "Rev" "Tim" "oth" "y"   "Lov" "ejo" "y"   "Ned" "Fla" "nde" "rs" 
## [23] "Sim" "pso" "n"   "Hom" "er"  "Dr"  "Jul" "ius" "Hib" "ber" "t"
# Runs of two or more letters: whole words, with the single-letter initial
# "C" dropped.
unlist(str_extract_all(string = names, pattern = "[[:alpha:]]{2,}"))
##  [1] "Moe"        "Szyslak"    "Burns"      "Montgomery" "Rev"       
##  [6] "Timothy"    "Lovejoy"    "Ned"        "Flanders"   "Simpson"   
## [11] "Homer"      "Dr"         "Julius"     "Hibbert"
# A word character followed by at least two more characters of any kind:
# each name is returned whole.
unlist(str_extract_all(string = names, pattern = "\\w.{2,}"))
## [1] "Moe Szyslak"          "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders"         "Simpson, Homer"       "Dr. Julius Hibbert"
# WORKING  END

Question 5

Describe the types of strings that conform to the following regular expressions and construct an example that is matched by the regular expression.

    1. [0-9]+\$
    1. \b[a-z]{1,4}\b
    1. .*?\.txt$
    1. \d{2}/\d{2}/\d{4}
    1. <(.+?)>.+?</\1>

Answer (a) [0-9]+\\$

Matches one or more consecutive digits (0-9) immediately followed by a literal dollar sign, for example "1232$".

# Sample text: three of the five digit runs are followed by a dollar sign.
a <- "0121210121$ 3423423 1232$ 0932$ 444"
str_extract_all(string = a, pattern = "[0-9]+\\$")
## [[1]]
## [1] "0121210121$" "1232$"       "0932$"

Answer (b) \\b[a-z]{1,4}\\b

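Matches whole words made up of one to four lowercase letters (for example "a", "is" or "nice"); longer words, and words containing capitals or digits, are not matched.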

# Sample text mixing short/long, lowercase/uppercase words and numbers.
b <- "nice verynice FIIVE SIIIXX ONE TWO THREE FOUR NICE TINY MANGO msds msdsd 0932$ 444 smal four."
unlist(str_extract_all(string = b, pattern = "\\b[a-z]{1,4}\\b"))
## [1] "nice" "msds" "smal" "four"

Answer (c) .*?\\.txt$ .

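Matches any string that ends in ".txt" (for example "one.txt"). Because of the `$` anchor, the ".txt" must be at the very end of the string; the lazy `.*?` in front allows any characters, or none, before it.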

# Candidate strings; only those whose last four characters are ".txt" match.
# (The variable is called `c`, like the base function; calls to c() still
# resolve to the function.)
c <- c(
  "one.txt", "two.*.txt", "three.*txt$", "four.*.txt$", "fine.*.tx",
  "Abcd MSDS max.txt  dot.*txt  onemore.*.txt"
)
unlist(str_extract_all(string = c, pattern = ".*?\\.txt$"))
## [1] "one.txt"                                   
## [2] "two.*.txt"                                 
## [3] "Abcd MSDS max.txt  dot.*txt  onemore.*.txt"

Answer (d) \\d{2}/\\d{2}/\\d{4}

Matches two digits, a "/", two more digits, another "/", and then four digits, i.e. a date written like dd/mm/yyyy or mm/dd/yyyy (for example "01/12/2019").

# Candidate dates; only the ones shaped as 2 digits / 2 digits / 4 digits match.
d <- c(
  "12/12/201", "1/1/2012", "01/12/2019",
  "00/0/2019", "12/02/2019"
)
unlist(str_extract_all(string = d, pattern = "\\d{2}/\\d{2}/\\d{4}"))
## [1] "01/12/2019" "12/02/2019"

Answer (e) <(.+?)>.+?</\\1>

Uses backreferencing to return any string that starts with a <text> and ends with </text>.

# Sample markup: each match runs from an opening tag to the closing tag of
# the same name (the backreference \\1 repeats the captured tag name).
e <- "<meta>MSDS CLASS 607</meta> <ol><li>Assignment</li><li>3e</li></ol>"
unlist(str_extract_all(string = e, pattern = "<(.+?)>.+?</\\1>"))
## [1] "<meta>MSDS CLASS 607</meta>"            
## [2] "<ol><li>Assignment</li><li>3e</li></ol>"

Challenge 6

The following code hides a secret message. Crack it with R and regular expressions. Hint: Some of the characters are more revealing than others! The code snippet is also available in the materials at www.r-datacollection.com.

clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0Tanwo Uwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigO d6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5 fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr

Answer is “CONGRATULATIONS.YOU.ARE.A.SUPERNERD”

ddd<- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"


iconv(ddd, from = "windows-1252", to = "UTF-8")
## [1] "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"
Encoding(ddd)
## [1] "unknown"
Encoding("I am rajwant")
## [1] "unknown"
str_extract_all(ddd,"\\d+")
## [[1]]
##  [1] "1"   "0"   "87"  "7"   "92"  "8"   "5"   "5"   "0"   "7"   "8"  
## [12] "03"  "5"   "3"   "0"   "7"   "55"  "3"   "3"   "6"   "4"   "1"  
## [23] "1"   "6"   "2"   "2"   "4"   "9"   "05"  "65"  "1"   "7"   "24" 
## [34] "6"   "3"   "9"   "5"   "89"  "6"   "5"   "9"   "4"   "905" "4"  
## [45] "5"
str_extract_all(ddd,"\\w")
## [[1]]
##   [1] "c" "l" "c" "o" "p" "C" "o" "w" "1" "z" "m" "s" "t" "c" "0" "d" "8"
##  [18] "7" "w" "n" "k" "i" "g" "7" "O" "v" "d" "i" "c" "p" "N" "u" "g" "g"
##  [35] "v" "h" "r" "y" "n" "9" "2" "G" "j" "u" "w" "c" "z" "i" "8" "h" "q"
##  [52] "r" "f" "p" "R" "x" "s" "5" "A" "j" "5" "d" "w" "p" "n" "0" "T" "a"
##  [69] "n" "w" "o" "U" "w" "i" "s" "d" "i" "j" "7" "L" "j" "8" "k" "p" "f"
##  [86] "0" "3" "A" "T" "5" "I" "d" "r" "3" "c" "o" "c" "0" "b" "t" "7" "y"
## [103] "c" "z" "j" "a" "t" "O" "a" "o" "o" "t" "j" "5" "5" "t" "3" "N" "j"
## [120] "3" "n" "e" "6" "c" "4" "S" "f" "e" "k" "r" "1" "w" "1" "Y" "w" "w"
## [137] "o" "j" "i" "g" "O" "d" "6" "v" "r" "f" "U" "r" "b" "z" "2" "2" "b"
## [154] "k" "A" "n" "b" "h" "z" "g" "v" "4" "R" "9" "i" "0" "5" "z" "E" "c"
## [171] "r" "o" "p" "w" "A" "g" "n" "b" "S" "q" "o" "U" "6" "5" "f" "P" "a"
## [188] "1" "o" "t" "f" "b" "7" "w" "E" "m" "2" "4" "k" "6" "t" "3" "s" "R"
## [205] "9" "z" "q" "e" "5" "f" "y" "8" "9" "n" "6" "N" "d" "5" "t" "9" "k"
## [222] "c" "4" "f" "E" "9" "0" "5" "g" "m" "c" "4" "R" "g" "x" "o" "5" "n"
## [239] "h" "D" "k" "g" "r"
str_extract_all(ddd,"\\w[[:punct:]]")
## [[1]]
## [1] "k." "2." "p." "b." "k!"
str_extract_all(ddd,"[[:upper:].]")
## [[1]]
##  [1] "C" "O" "N" "G" "R" "A" "T" "U" "L" "A" "T" "I" "O" "N" "S" "." "Y"
## [18] "O" "U" "." "A" "R" "E" "." "A" "." "S" "U" "P" "E" "R" "N" "E" "R"
## [35] "D"
# Keep only the capital letters and periods, in order of appearance.
secret_chars <- unlist(str_extract_all(ddd, "[A-Z.]"))
secret_chars
##  [1] "C" "O" "N" "G" "R" "A" "T" "U" "L" "A" "T" "I" "O" "N" "S" "." "Y"
## [18] "O" "U" "." "A" "R" "E" "." "A" "." "S" "U" "P" "E" "R" "N" "E" "R"
## [35] "D"
# Glue the characters back together into the hidden message.
secret_message <- paste0(secret_chars, collapse = "")
secret_message
## [1] "CONGRATULATIONS.YOU.ARE.A.SUPERNERD"
# Split on the literal periods to get the individual words as a one-row matrix.
str_split(secret_message, "\\.", simplify = TRUE)
##      [,1]              [,2]  [,3]  [,4] [,5]       
## [1,] "CONGRATULATIONS" "YOU" "ARE" "A"  "SUPERNERD"

#——————————————————-END ———————————–




# A word character immediately followed by a punctuation character.
str_extract_all(string = ddd, pattern = "\\w[[:punct:]]")
## [[1]]
## [1] "k." "2." "p." "b." "k!"
# A digit immediately followed by a capital letter.
str_extract_all(string = ddd, pattern = "\\d[[A-Z]]")
## [[1]]
##  [1] "7O" "2G" "5A" "0T" "7L" "3A" "5I" "3N" "4S" "1Y" "4R" "6N" "4R"
# Negated character class; per the output below it leaves the capitals, the
# periods and the final "!".
unlist(str_extract_all(string = ddd, pattern = "[^[a-z]|[0-9]]"))
##  [1] "C" "O" "N" "G" "R" "A" "T" "U" "L" "A" "T" "I" "O" "N" "S" "." "Y"
## [18] "O" "U" "." "A" "R" "E" "." "A" "." "S" "U" "P" "E" "R" "N" "E" "R"
## [35] "D" "!"

Some Other Test

#-----------------------------------------------------------------

# Which names contain each title pattern. The "." is left unescaped here, so
# it acts as a regex wildcard.
grepl("Dr.", names)
## [1] FALSE FALSE FALSE FALSE FALSE  TRUE
grepl("Rev.", names)
## [1] FALSE FALSE  TRUE FALSE FALSE FALSE
data.frame(
  names,
  "DR." = grepl("Dr.", names),
  "Rev." = grepl("Rev.", names)
)
# ---- "Rev.", "Tim" ----
# One column of matches per pattern, one row per name.
indicies <- sapply(
  c("Rev.", "Tim"),
  function(p) {
    grepl(p, names,
          ignore.case = FALSE, perl = FALSE,
          fixed = FALSE, useBytes = FALSE)
  }
)

# Strict match: TRUE only for names that match every pattern.
index <- apply(indicies, MARGIN = 1, FUN = all)


#------------------------------------------------------------------------------
# Has 2nd name
# With the pattern "." every character counts as a match, so this is simply
# the length of the string.
str_count(string = "asas asasdsa asdasdsad .dsafdfd", pattern = ".")
## [1] 31
# More than two alphabetic words => "YES". The unnamed columns are left as
# inline expressions because data.frame() derives their names from them.
data.frame(
  names,
  ifelse(str_count(names, "[[:alpha:]]+") > 2, "YES", "No"),
  "DR." = grepl("Dr.", names),
  "Rev." = grepl("Rev.", names)
)
# Alternative probe: does the name contain a comma (optionally followed by a
# space)?
data.frame(
  names,
  str_detect(names, ", ?"),
  "DR." = grepl("Dr.", names),
  "Rev." = grepl("Rev.", names)
)