Please deliver links to an R Markdown file (in GitHub and rpubs.com) with solutions to problems 3 and 4 from chapter 8 of Automated Data Collection in R. Problem 9 is extra credit. You may work in a small group, but please submit separately with names of all group participants in your submission. Here is the referenced code for the introductory example in #3: raw.data<-“555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert” Due end of day Sunday September 16.
library(stringr)
We extract the first names and store them in `firstName`. Then we extract the last names and store them in `lastName`. Finally, we paste the first and last names together.
# Raw string from the exercise: character names interleaved with phone numbers.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"

# Keep every run of two or more letters, periods, commas or spaces (the
# names), and flatten the one-element list result into a character vector.
name <- unlist(
  str_extract_all(string = raw.data, pattern = "[[:alpha:]., ]{2,}")
)
name
## [1] "Moe Szyslak" "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" "Simpson, Homer" "Dr. Julius Hibbert"
# First names, as a list with one element per entry of `name`.
# Inner call: grab either a word followed by a space (the given name in
# "First Last" order) or everything after ", " (the given names in
# "Last, First" order).
# Outer call: trim each fragment to the span from its first capital letter to
# its last lowercase letter, dropping the trailing space or leading ", ".
# `.+` replaces the original `(.+?)+`: a quantified group that is itself
# quantified matches the same text but can backtrack exponentially.
firstName <- str_extract_all(
  unlist(str_extract_all(name, "[:alpha:]{1,25} |, [:print:]{1,25}")),
  "[A-Z].+[a-z]"
)
firstName
## [[1]]
## [1] "Moe"
##
## [[2]]
## [1] "C. Montgomery"
##
## [[3]]
## [1] "Timothy"
##
## [[4]]
## [1] "Ned"
##
## [[5]]
## [1] "Homer"
##
## [[6]]
## [1] "Julius"
# Last names, as a list with one element per entry of `name`.
# Inner call: grab either "<lowercase letter> <word>" anchored at the end of
# the string (the surname in "First Last" order) or the text up to and
# including a comma (the surname in "Last, First" order).
# Outer call: keep a letter followed by one or more lowercase letters, which
# drops the leading "<letter> " fragment or the trailing comma.
# `[A-Za-z]` replaces `[A-z]`, whose code-point range also admits
# [ \ ] ^ _ and the backtick. The old second alternative (`...+,`) was
# unreachable because the first alternative always matched first.
lastName <- str_extract_all(
  unlist(
    str_extract_all(name, "[a-z] [:alpha:]{1,25}[a-z]$|[:print:]{1,25},")
  ),
  "[A-Za-z][a-z]+"
)
lastName
## [[1]]
## [1] "Szyslak"
##
## [[2]]
## [1] "Burns"
##
## [[3]]
## [1] "Lovejoy"
##
## [[4]]
## [1] "Flanders"
##
## [[5]]
## [1] "Simpson"
##
## [[6]]
## [1] "Hibbert"
# Combine the names as "First Last".
# `firstName` and `lastName` are lists; pasting them directly relies on
# implicit as.character() coercion, which yields deparsed text such as
# 'c("a", "b")' or "character(0)" if an element does not hold exactly one
# string. Fail loudly in that case, then paste plain character vectors.
stopifnot(lengths(firstName) == 1L, lengths(lastName) == 1L)
paste(unlist(firstName), unlist(lastName))
## [1] "Moe Szyslak" "C. Montgomery Burns" "Timothy Lovejoy"
## [4] "Ned Flanders" "Homer Simpson" "Julius Hibbert"
We use `str_extract_all()` to find every character whose name includes a title.
# A title is taken to be two or three letters immediately followed by a
# period (e.g. "Dr." or "Rev."). The result has one list element per name;
# names without a title give character(0).
title <- str_extract_all(
  string = name,
  pattern = "[:alpha:]{3}[.]|[:alpha:]{2}[.]"
)
title
## [[1]]
## character(0)
##
## [[2]]
## character(0)
##
## [[3]]
## [1] "Rev."
##
## [[4]]
## character(0)
##
## [[5]]
## character(0)
##
## [[6]]
## [1] "Dr."
# Logical vector: TRUE where the name contains a title (same pattern as the
# extraction above, reduced to a yes/no answer per name).
logicalTittle <- str_detect(
  string = name,
  pattern = "[:alpha:]{3}[.]|[:alpha:]{2}[.]"
)
logicalTittle
## [1] FALSE FALSE TRUE FALSE FALSE TRUE
So the third and sixth names have a title.
# A second (middle) name is taken to be a single-letter initial: either
# " X." (space, one letter, period) or " X " (space, one letter, space).
# The original pattern had a third alternative, `:alpha:]{1}`, which was
# missing its opening bracket and therefore only matched the literal text
# ":alpha:]"; it is removed because it never described a name.
secondName <- str_extract_all(name, " [:alpha:]{1}[.]| [:alpha:] ")
secondName
## [[1]]
## character(0)
##
## [[2]]
## [1] " C."
##
## [[3]]
## character(0)
##
## [[4]]
## character(0)
##
## [[5]]
## character(0)
##
## [[6]]
## character(0)
# Logical vector: TRUE where the name contains a single-letter middle initial,
# written either " X." or " X ".
# The original pattern had a third alternative, `:alpha:]{1}`, which was
# missing its opening bracket and therefore only matched the literal text
# ":alpha:]"; it is removed because it never described a name.
logicalSecondName <- str_detect(name, " [:alpha:]{1}[.]| [:alpha:] ")
logicalSecondName
## [1] FALSE TRUE FALSE FALSE FALSE FALSE
So only the second person has a second name.
`[0-9]+\\$` matches a run of digits followed by a dollar sign. `[0-9]+` finds one or more consecutive digits from 0 to 9, and `\\$` finds the literal `$`: the `\\` escapes `$` so that it is treated as a literal character and not as the end-of-string anchor. Four matching and three non-matching examples are shown.
# Sample strings for the "digits followed by a dollar sign" pattern.
text <- c(
  "4587$", "486 386$", "39874$239", "123456 123456$",
  "11111", "$111", "93837abcde$"
)
# Show the matched portion of each string.
str_extract_all(string = text, pattern = "[0-9]+\\$")
## [[1]]
## [1] "4587$"
##
## [[2]]
## [1] "386$"
##
## [[3]]
## [1] "39874$"
##
## [[4]]
## [1] "123456$"
##
## [[5]]
## character(0)
##
## [[6]]
## character(0)
##
## [[7]]
## character(0)
# TRUE for each string that contains digits immediately followed by "$".
str_detect(string = text, pattern = "[0-9]+\\$")
## [1] TRUE TRUE TRUE TRUE FALSE FALSE FALSE
`\\b[a-z]{1,4}\\b` searches for whole words made up only of the lowercase letters a to z, with no blank spaces, that are 1 to 4 characters long. Four matching and two non-matching examples are shown.
# Sample strings for the "lowercase word of 1 to 4 letters" pattern.
text <- c(
  "hello", "well", "python is awesome!!!",
  "spyderman@gmail.com", "fabulous", "one stich saves nine"
)
# Show every short lowercase word found in each string.
str_extract_all(string = text, pattern = "\\b[a-z]{1,4}\\b")
## [[1]]
## character(0)
##
## [[2]]
## [1] "well"
##
## [[3]]
## [1] "is"
##
## [[4]]
## [1] "com"
##
## [[5]]
## character(0)
##
## [[6]]
## [1] "one" "nine"
# TRUE for each string that contains at least one lowercase word of 1 to 4
# letters.
str_detect(string = text, pattern = "\\b[a-z]{1,4}\\b")
## [1] FALSE TRUE TRUE TRUE FALSE TRUE
`.*?\\.txt$` matches any string that ends in `.txt`. Four matching and two non-matching examples are shown.
# Sample file names for the "ends in .txt" pattern.
text <- c(
  "cunydata607.txt", "apple.txt", "\\.txt",
  "assignment wk3.txt", "readmetxt", "txt.py"
)
# Show the full match for each string that ends in ".txt".
str_extract_all(string = text, pattern = ".*?\\.txt$")
## [[1]]
## [1] "cunydata607.txt"
##
## [[2]]
## [1] "apple.txt"
##
## [[3]]
## [1] "\\.txt"
##
## [[4]]
## [1] "assignment wk3.txt"
##
## [[5]]
## character(0)
##
## [[6]]
## character(0)
# TRUE for each string that ends in a literal ".txt".
str_detect(string = text, pattern = ".*?\\.txt$")
## [1] TRUE TRUE TRUE TRUE FALSE FALSE
`\\d{2}/\\d{2}/\\d{4}` looks for dates written as XX/XX/XXXX, for example “09/15/2018”. Two matching and two non-matching examples are shown.
# Sample strings for the two-digit/two-digit/four-digit date pattern.
text <- c("06/01/1969", "03/08/1968", "4/1/9999", "11/11/11")
# Show the date found in each string, if any.
str_extract_all(string = text, pattern = "\\d{2}/\\d{2}/\\d{4}")
## [[1]]
## [1] "06/01/1969"
##
## [[2]]
## [1] "03/08/1968"
##
## [[3]]
## character(0)
##
## [[4]]
## character(0)
# TRUE for each string that contains a date in the XX/XX/XXXX layout.
str_detect(string = text, pattern = "\\d{2}/\\d{2}/\\d{4}")
## [1] TRUE TRUE FALSE FALSE
This expression uses a backreference to find text that opens with a tag in `<>` and later repeats the same tag name after `</`, so it can be used to find tags in an HTML document. The pattern starts at an opening `<someTag>`: the `()` captures the tag name, and the `.+?` inside it accepts any tag name, not one in particular. The next `.+?` steps through whatever the body of the tag might be. Finally the pattern looks for the closing tag: `</` followed by `\\1`, a reference back to what was captured inside the opening `<>`. Three matching and three non-matching examples are shown.
# Sample strings: the first three contain a matching open/close tag pair, the
# last three do not.
text <- c(
  '<tag>anatomy</tag>', '<tagtag>tag</tagtag>', '<tag>nothing to fear</tag>',
  '<tag>weird<,tag,>', '<!tag>impossible<|\tag>', '<~tag>legend tag<"tag">'
)
# `(.+?)` captures the tag name, the second `.+?` lazily consumes the body,
# and `</\\1>` requires a closing tag whose name is exactly the captured one.
# The trailing `>` was missing: without it the match stops before the closing
# bracket, and a closing tag whose name merely starts with the captured text
# (e.g. `<b>x</body>`) would be accepted.
str_extract_all(text, "<(.+?)>.+?</\\1>")
## [[1]]
## [1] "<tag>anatomy</tag>"
##
## [[2]]
## [1] "<tagtag>tag</tagtag>"
##
## [[3]]
## [1] "<tag>nothing to fear</tag>"
##
## [[4]]
## character(0)
##
## [[5]]
## character(0)
##
## [[6]]
## character(0)
str_detect(text, "<(.+?)>.+?</\\1>")
## [1] TRUE TRUE TRUE FALSE FALSE FALSE
clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0Tanwo Uwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigO d6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5 fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr
# Scrambled puzzle string in which the hidden message is searched for below.
# NOTE(review): this literal differs from the puzzle text quoted just before
# it in four places ("hprfp" vs "hqrfp", "At5" vs "AT5", "Pal" vs "Pa1",
# "N5t9" vs "Nd5t9"). The lowercase "t" in "At5" looks like the reason the
# decoded message further down lacks a "T" -- TODO confirm against the book.
code<-'clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hprfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03At5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPalotfb7wEm24k6t3sR9zqe5fy89n6N5t9kc4fE905gmc4Rgxo5nhDk!gr'
We try different approaches, such as extracting all the lowercase letters, runs of letters, and the uppercase letters in the message, to find the hidden result.
# First attempt: every lowercase letter, one at a time.
str_extract_all(string = code, pattern = "[a-z]")
## [[1]]
## [1] "c" "l" "c" "o" "p" "o" "w" "z" "m" "s" "t" "c" "d" "w" "n" "k" "i"
## [18] "g" "v" "d" "i" "c" "p" "u" "g" "g" "v" "h" "r" "y" "n" "j" "u" "w"
## [35] "c" "z" "i" "h" "p" "r" "f" "p" "x" "s" "j" "d" "w" "p" "n" "a" "n"
## [52] "w" "o" "w" "i" "s" "d" "i" "j" "j" "k" "p" "f" "t" "d" "r" "c" "o"
## [69] "c" "b" "t" "y" "c" "z" "j" "a" "t" "a" "o" "o" "t" "j" "t" "j" "n"
## [86] "e" "c" "f" "e" "k" "r" "w" "w" "w" "o" "j" "i" "g" "d" "v" "r" "f"
## [103] "r" "b" "z" "b" "k" "n" "b" "h" "z" "g" "v" "i" "z" "c" "r" "o" "p"
## [120] "w" "g" "n" "b" "q" "o" "f" "a" "l" "o" "t" "f" "b" "w" "m" "k" "t"
## [137] "s" "z" "q" "e" "f" "y" "n" "t" "k" "c" "f" "g" "m" "c" "g" "x" "o"
## [154] "n" "h" "k" "g" "r"
# Second attempt: runs of consecutive lowercase letters.
str_extract_all(string = code, pattern = "[a-z]+")
## [[1]]
## [1] "clcop" "ow" "zmstc" "d" "wnkig" "vdicp"
## [7] "uggvhryn" "juwczi" "hprfp" "xs" "j" "dwpn"
## [13] "anwo" "wisdij" "j" "kpf" "t" "dr"
## [19] "coc" "bt" "yczjat" "aootj" "t" "j"
## [25] "ne" "c" "fek" "r" "w" "wwojig"
## [31] "d" "vrf" "rbz" "bk" "nbhzgv" "i"
## [37] "z" "crop" "w" "gnb" "qo" "f"
## [43] "alotfb" "w" "m" "k" "t" "s"
## [49] "zqe" "fy" "n" "t" "kc" "f"
## [55] "gmc" "gxo" "nh" "k" "gr"
# Third attempt: runs of consecutive letters of either case.
str_extract_all(string = code, pattern = "[A-Za-z]+")
## [[1]]
## [1] "clcopCow" "zmstc" "d"
## [4] "wnkig" "OvdicpNuggvhryn" "Gjuwczi"
## [7] "hprfpRxs" "Aj" "dwpn"
## [10] "TanwoUwisdij" "Lj" "kpf"
## [13] "At" "Idr" "coc"
## [16] "bt" "yczjatOaootj" "t"
## [19] "Nj" "ne" "c"
## [22] "Sfek" "r" "w"
## [25] "YwwojigOd" "vrfUrbz" "bkAnbhzgv"
## [28] "R" "i" "zEcrop"
## [31] "wAgnb" "SqoU" "fPalotfb"
## [34] "wEm" "k" "t"
## [37] "sR" "zqe" "fy"
## [40] "n" "N" "t"
## [43] "kc" "fE" "gmc"
## [46] "Rgxo" "nhDk" "gr"
# Decode the hidden message. The uppercase letters spell it out, and the
# punctuation already present in `code` marks the word breaks ("." between
# words, "!" at the end). Extracting both replaces the earlier hand-written
# letter-pair replacements ("SY" -> "S Y", "RN" -> "R N", ...), which only
# worked for this exact text and split "SUPERNERD" where the puzzle has no
# separator.
codeMessage <- str_extract_all(code, "[A-Z.!]")
codeMessage
## [[1]]
## [1] "C" "O" "N" "G" "R" "A" "T" "U" "L" "A" "I" "O" "N" "S" "." "Y" "O"
## [18] "U" "." "A" "R" "E" "." "A" "." "S" "U" "P" "E" "R" "N" "E" "R" "D"
## [35] "!"
codeMessageString <- paste(unlist(codeMessage), collapse = "")
codeMessageString
## [1] "CONGRATULAIONS.YOU.ARE.A.SUPERNERD!"
# `\\.` is a literal full stop; an unescaped "." would replace every character.
codeMessageString <- str_replace_all(codeMessageString, "\\.", " ")
codeMessageString
## [1] "CONGRATULAIONS YOU ARE A SUPERNERD!"
# NOTE(review): the first word reads "CONGRATULAIONS" because the `code`
# literal contains "At5" where the quoted puzzle text has "AT5".