Please deliver links to an R Markdown file (in GitHub and rpubs.com) with solutions to problems 3 and 4 from chapter 8 of Automated Data Collection in R. Problem 9 is extra credit. You may work in a small group, but please submit separately with names of all group participants in your submission.
Here is the referenced code for the introductory example in #3:
# Raw input for the introductory example: phone numbers and names run
# together in one string. Plain ASCII double quotes are required; the
# typographic quotes this line was pasted with are not valid R syntax.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"
# WORKING
#-----------------------------Display Name -------------------------------------------
# Phone-book string: phone numbers and names run together without separators.
rawdata <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"
# A name is any run of two or more letters, periods, commas, or spaces.
# rawdata is a single string, so the first list element holds every match.
names <- str_extract_all(string = rawdata, pattern = "[[:alpha:]., ]{2,}")[[1]]
print(names)
## [1] "Moe Szyslak" "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" "Simpson, Homer" "Dr. Julius Hibbert"
#-------------------Display 1st and Last name --------------------
#-------------------------
# Target to achieve
#First Name || Last Name |
#-------------------------
# Moe Szyslak
# Burns C. Montgomery
#
# Ned Flanders
# Simpson, Homer
# Dr. Julius Hibbert
#-------------------------
#-------FM which returns the positions of the matching strings in a vector.
# grepall: find the elements of `x` that match ALL of the given patterns.
#
# Arguments:
#   pattern      vector of patterns; an element of `x` is a hit only if
#                every pattern matches it.
#   x            vector to search.
#   ignore.case, perl, fixed, useBytes
#                passed through unchanged to grepl().
#   value        FALSE (default): return the positions of the hits;
#                TRUE: return the matching elements of `x` themselves.
#   logic        TRUE: return the logical hit vector (one entry per element
#                of `x`); takes precedence over `value`.
#
# Returns NULL, with a warning, when `pattern` or `x` has length zero.
grepall <- function(pattern, x,
                    ignore.case = FALSE, perl = FALSE,
                    fixed = FALSE, useBytes = FALSE,
                    value = FALSE, logic = FALSE) {
  # error and exception handling (scalar condition, so use ||)
  if (length(pattern) == 0 || length(x) == 0) {
    warning("Length of pattern or data equals zero.")
    return(NULL)
  }

  # One grepl() result per pattern. vapply() pins every result to length(x),
  # and matrix() guarantees a rows = x, columns = pattern layout even when
  # `x` has a single element. sapply() collapsed that case to a plain
  # vector, which made the apply() call below fail.
  hits <- vapply(
    pattern,
    function(p) {
      grepl(p, x,
            ignore.case = ignore.case, perl = perl,
            fixed = fixed, useBytes = useBytes)
    },
    logical(length(x)),
    USE.NAMES = FALSE
  )
  hits <- matrix(hits, nrow = length(x), ncol = length(pattern))

  # An element of x qualifies only if all patterns matched it.
  index <- apply(hits, 1, all)

  # indexation and return of results
  if (logic) {
    return(index)
  }
  if (value) x[index] else seq_along(x)[index]
}
#------End of FM
# Build the Title / Full Name / First Name / Last Name table.
# Title is pre-sized to one slot per name. Starting from a single NA only
# grew the vector up to the highest matched index, so it could end up shorter
# than `names` and be silently recycled by data.frame().
Title <- rep(NA_character_, length(names))
# Escape the period so only a literal "Dr." / "Rev." is treated as a title.
Title[grepall("Dr\\.", names)] <- "Dr."
Title[grepall("Rev\\.", names)] <- "Rev."
# Names with the title removed and surrounding whitespace trimmed.
tName <- trimws(str_replace_all(names, "Rev\\.|Dr\\.", ""))
# simplify = TRUE returns a character matrix with one row per name; the
# pattern yields two pieces per name here, which become the two columns.
FLName <- trimws(str_extract_all(tName, "[[:alpha:]]{2,}|[[:alpha:]. ?]{2,}|[.?[:alpha:] ]{2,}", simplify = TRUE))
colnames(FLName) <- c("First Name", "Last Name")
datatable(data.frame(Title, "Full Name" = tName, FLName))
#-----------------------------HAS Title (Rev. or Dr.) -------------------------------------------
# The periods are escaped so that only a literal "Dr." or "Rev." counts as a
# title; an unescaped "." would match any character.
data.frame(names, Title, "Has_Title" = grepl("Dr\\.|Rev\\.", names))
# OPTION 2 : --------------
# Look for a capital letter followed by one or two lowercase letters and a period #----------------------------
data.frame(names, Title,
           "Has_Title" = str_detect(names, "\\b[A-Z]{1}[a-z]{1,2}\\."))
#-----------------------------HAS 2nd Name----------------------------------------------------------
# tName holds the names with the title removed, so more than two alphabetic
# words means a second name. Sec_Name is a logical column rather than the
# strings "TRUE"/"FALSE".
data.frame("Raw_Name" = names, "Full_Name" = tName,
           "Sec_Name" = str_count(tName, "[[:alpha:]]+") > 2)
#--------------------------------------------------------------------------------------
# Scratch experiments: three other ways of cutting up the name vector.
# 1. Runs of one to three letters - longer words are chopped into pieces.
unlist(str_extract_all(string = names, pattern = "[[:alpha:]]{1,3}"), recursive = TRUE)
## [1] "Moe" "Szy" "sla" "k" "Bur" "ns" "C" "Mon" "tgo" "mer" "y"
## [12] "Rev" "Tim" "oth" "y" "Lov" "ejo" "y" "Ned" "Fla" "nde" "rs"
## [23] "Sim" "pso" "n" "Hom" "er" "Dr" "Jul" "ius" "Hib" "ber" "t"
# 2. Runs of two or more letters - whole words; the lone initial "C" is dropped.
unlist(str_extract_all(string = names, pattern = "[[:alpha:]]{2,}"))
## [1] "Moe" "Szyslak" "Burns" "Montgomery" "Rev"
## [6] "Timothy" "Lovejoy" "Ned" "Flanders" "Simpson"
## [11] "Homer" "Dr" "Julius" "Hibbert"
# 3. A word character followed by at least two more characters of any kind -
#    each name comes back whole.
unlist(str_extract_all(string = names, pattern = "\\w.{2,}"))
## [1] "Moe Szyslak" "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" "Simpson, Homer" "Dr. Julius Hibbert"
# WORKING END
Describe the types of strings that conform to the following regular expressions and construct an example that is matched by the regular expression.
Answer
(a) [0-9]+\\$
Matches one or more consecutive digits (0-9) immediately followed by a literal dollar sign, e.g. "1232$". Digit runs without a trailing "$" are not returned.
# Sample text: three of the digit runs end in "$", two do not.
a <- "0121210121$ 3423423 1232$ 0932$ 444"
str_extract_all(string = a, pattern = "[0-9]+\\$")
## [[1]]
## [1] "0121210121$" "1232$" "0932$"
Answer
(b) \\b[a-z]{1,4}\\b
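Matches whole words made up of one to four lowercase letters (for example "a", "to" or "nice"). Words that are longer than four letters, or that contain uppercase letters or digits, are not returned.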
# Sample text mixing short and long lowercase words, uppercase words and digits.
b <- "nice verynice FIIVE SIIIXX ONE TWO THREE FOUR NICE TINY MANGO msds msdsd 0932$ 444 smal four."
# b is a single string, so the first list element holds every match.
str_extract_all(string = b, pattern = "\\b[a-z]{1,4}\\b")[[1]]
## [1] "nice" "msds" "smal" "four"
Answer
(c) .*?\\.txt$ .
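Matches any string that ends in ".txt": `.*?` lazily matches any (possibly empty) run of characters, `\\.txt` matches the literal extension, and `$` anchors the match at the end of the string.
Strings that do not end in ".txt" (for example "three.*txt$" or "fine.*.tx") return nothing.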
# Candidates: some end in ".txt", others end in "$" or in a truncated extension.
c <- c(
  "one.txt",
  "two.*.txt",
  "three.*txt$",
  "four.*.txt$",
  "fine.*.tx",
  "Abcd MSDS max.txt dot.*txt onemore.*.txt"
)
# Only the elements that end in ".txt" produce a match.
unlist(str_extract_all(string = c, pattern = ".*?\\.txt$"))
## [1] "one.txt"
## [2] "two.*.txt"
## [3] "Abcd MSDS max.txt dot.*txt onemore.*.txt"
Answer
(d) \\d{2}/\\d{2}/\\d{4}
Matches a date-like pattern: exactly two digits (`\\d{2}`), a "/", two more digits, another "/", and then four digits (`\\d{4}`). The pattern is not anchored, so it can match anywhere inside a string.
# Candidates: only the entries with two digits, two digits and four digits match.
d <- c("12/12/201", "1/1/2012", "01/12/2019", "00/0/2019", "12/02/2019")
unlist(str_extract_all(string = d, pattern = "\\d{2}/\\d{2}/\\d{4}"))
## [1] "01/12/2019" "12/02/2019"
Answer
(e) <(.+?)>.+?</\\1>
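Uses a backreference to match an opening tag, its content, and the matching closing tag: `<(.+?)>` captures the tag name, `.+?` lazily matches at least one character of content, and `</\\1>` requires a closing tag with the same name.
In other words, it returns any string that starts with a <text> and ends with the corresponding </text>.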
# Two top-level elements: a <meta> tag and an <ol> list with nested <li> items.
e <- "<meta>MSDS CLASS 607</meta> <ol><li>Assignment</li><li>3e</li></ol>"
# e is a single string, so the first list element holds every match.
str_extract_all(string = e, pattern = "<(.+?)>.+?</\\1>")[[1]]
## [1] "<meta>MSDS CLASS 607</meta>"
## [2] "<ol><li>Assignment</li><li>3e</li></ol>"
The following code hides a secret message. Crack it with R and regular expressions. Hint: Some of the characters are more revealing than others! The code snippet is also available in the materials at www.r-datacollection.com.
clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0Tanwo Uwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigO d6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5 fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr
Answer is “CONGRATULATIONS.YOU.ARE.A.SUPERNERD”
# The coded text from the exercise, joined into one string without the line-wrap spaces.
ddd <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"
# Re-encoding leaves the text unchanged; the result is only printed, not stored.
iconv(x = ddd, from = "windows-1252", to = "UTF-8")
## [1] "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"
# Declared encoding of the coded text, and of an ordinary literal for comparison.
Encoding(x = ddd)
## [1] "unknown"
Encoding(x = "I am rajwant")
## [1] "unknown"
# First probe: every run of consecutive digits in the coded text.
str_extract_all(string = ddd, pattern = "\\d+")
## [[1]]
## [1] "1" "0" "87" "7" "92" "8" "5" "5" "0" "7" "8"
## [12] "03" "5" "3" "0" "7" "55" "3" "3" "6" "4" "1"
## [23] "1" "6" "2" "2" "4" "9" "05" "65" "1" "7" "24"
## [34] "6" "3" "9" "5" "89" "6" "5" "9" "4" "905" "4"
## [45] "5"
# Second probe: every single word character (letters and digits), one per element.
str_extract_all(string = ddd, pattern = "\\w")
## [[1]]
## [1] "c" "l" "c" "o" "p" "C" "o" "w" "1" "z" "m" "s" "t" "c" "0" "d" "8"
## [18] "7" "w" "n" "k" "i" "g" "7" "O" "v" "d" "i" "c" "p" "N" "u" "g" "g"
## [35] "v" "h" "r" "y" "n" "9" "2" "G" "j" "u" "w" "c" "z" "i" "8" "h" "q"
## [52] "r" "f" "p" "R" "x" "s" "5" "A" "j" "5" "d" "w" "p" "n" "0" "T" "a"
## [69] "n" "w" "o" "U" "w" "i" "s" "d" "i" "j" "7" "L" "j" "8" "k" "p" "f"
## [86] "0" "3" "A" "T" "5" "I" "d" "r" "3" "c" "o" "c" "0" "b" "t" "7" "y"
## [103] "c" "z" "j" "a" "t" "O" "a" "o" "o" "t" "j" "5" "5" "t" "3" "N" "j"
## [120] "3" "n" "e" "6" "c" "4" "S" "f" "e" "k" "r" "1" "w" "1" "Y" "w" "w"
## [137] "o" "j" "i" "g" "O" "d" "6" "v" "r" "f" "U" "r" "b" "z" "2" "2" "b"
## [154] "k" "A" "n" "b" "h" "z" "g" "v" "4" "R" "9" "i" "0" "5" "z" "E" "c"
## [171] "r" "o" "p" "w" "A" "g" "n" "b" "S" "q" "o" "U" "6" "5" "f" "P" "a"
## [188] "1" "o" "t" "f" "b" "7" "w" "E" "m" "2" "4" "k" "6" "t" "3" "s" "R"
## [205] "9" "z" "q" "e" "5" "f" "y" "8" "9" "n" "6" "N" "d" "5" "t" "9" "k"
## [222] "c" "4" "f" "E" "9" "0" "5" "g" "m" "c" "4" "R" "g" "x" "o" "5" "n"
## [239] "h" "D" "k" "g" "r"
# Third probe: a word character together with the punctuation mark that follows it.
str_extract_all(string = ddd, pattern = "\\w[[:punct:]]")
## [[1]]
## [1] "k." "2." "p." "b." "k!"
# Keep only uppercase letters and periods: these spell out the hidden message.
str_extract_all(string = ddd, pattern = "[[:upper:].]")
## [[1]]
## [1] "C" "O" "N" "G" "R" "A" "T" "U" "L" "A" "T" "I" "O" "N" "S" "." "Y"
## [18] "O" "U" "." "A" "R" "E" "." "A" "." "S" "U" "P" "E" "R" "N" "E" "R"
## [35] "D"
# Same extraction with an explicit A-Z range, flattened to a character vector.
unlist(str_extract_all(string = ddd, pattern = "[A-Z.]"))
## [1] "C" "O" "N" "G" "R" "A" "T" "U" "L" "A" "T" "I" "O" "N" "S" "." "Y"
## [18] "O" "U" "." "A" "R" "E" "." "A" "." "S" "U" "P" "E" "R" "N" "E" "R"
## [35] "D"
# Glue the extracted characters back into a single string.
paste(unlist(str_extract_all(string = ddd, pattern = "[A-Z.]")), collapse = "")
## [1] "CONGRATULATIONS.YOU.ARE.A.SUPERNERD"
# Split the message into its words at the literal periods.
str_split(
  string = paste(unlist(str_extract_all(string = ddd, pattern = "[A-Z.]")), collapse = ""),
  pattern = "\\.",
  simplify = TRUE
)
## [,1] [,2] [,3] [,4] [,5]
## [1,] "CONGRATULATIONS" "YOU" "ARE" "A" "SUPERNERD"
#------------------- END -------------------
# Scratch work kept after the solution.
# A word character together with the punctuation mark that follows it.
str_extract_all(string = ddd, pattern = "\\w[[:punct:]]")
## [[1]]
## [1] "k." "2." "p." "b." "k!"
# A digit together with the uppercase letter that follows it.
str_extract_all(string = ddd, pattern = "\\d[[A-Z]]")
## [[1]]
## [1] "7O" "2G" "5A" "0T" "7L" "3A" "5I" "3N" "4S" "1Y" "4R" "6N" "4R"
# Characters that are neither lowercase letters nor digits; unlike the
# uppercase-and-period pattern this also keeps the "!".
unlist(str_extract_all(string = ddd, pattern = "[^[a-z]|[0-9]]"))
## [1] "C" "O" "N" "G" "R" "A" "T" "U" "L" "A" "T" "I" "O" "N" "S" "." "Y"
## [18] "O" "U" "." "A" "R" "E" "." "A" "." "S" "U" "P" "E" "R" "N" "E" "R"
## [35] "D" "!"
#-----------------------------------------------------------------
# Flag which names carry each title. fixed = TRUE makes the period literal;
# as a regular expression "Dr." would match "Dr" followed by any character.
grepl("Dr.", names, fixed = TRUE)
## [1] FALSE FALSE FALSE FALSE FALSE TRUE
grepl("Rev.", names, fixed = TRUE)
## [1] FALSE FALSE TRUE FALSE FALSE FALSE
# One row per name with a logical column per title.
data.frame(names,
           "DR." = grepl("Dr.", names, fixed = TRUE),
           "Rev." = grepl("Rev.", names, fixed = TRUE))
#-----------------------------------"Rev.","Tim"-------------------------------
# For each pattern, flag which names contain it. With several names the
# result is a logical matrix: one row per name, one column per pattern.
indicies <- sapply(
  c("Rev.", "Tim"),
  function(p) {
    grepl(p, names,
          ignore.case = FALSE, perl = FALSE,
          fixed = FALSE, useBytes = FALSE)
  }
)
# Strict AND match: a name qualifies only if every pattern matched it.
index <- apply(indicies, MARGIN = 1, FUN = all)
#------------------------------------------------------------------------------
# Has 2nd Name
# Note: as a regular expression "." matches any character, so this counts
# every character in the string (31), not just the periods.
str_count("asas asasdsa asdasdsad .dsafdfd", pattern = ".")
## [1] 31
# Has a second name: count the alphabetic words after removing the title, so
# that "Rev." / "Dr." are not themselves counted as a name. The title tests
# use fixed = TRUE so the period is matched literally.
data.frame(names,
           "Has_2nd_Name" = ifelse(str_count(str_replace_all(names, "Rev\\.|Dr\\.", ""), "[[:alpha:]]+") > 2, "YES", "No"),
           "DR." = grepl("Dr.", names, fixed = TRUE),
           "Rev." = grepl("Rev.", names, fixed = TRUE))
# Names containing a comma (optionally followed by a space), alongside the
# same title flags.
data.frame(names,
           "Has_Comma" = str_detect(names, ", ?"),
           "DR." = grepl("Dr.", names, fixed = TRUE),
           "Rev." = grepl("Rev.", names, fixed = TRUE))