library(stringr)
# Raw phone-book text: names and phone numbers run together in one string.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"
# Extract every run of at least two letters, periods, commas or spaces.
# str_extract_all() returns a list with one element per input string; there is
# a single input here, so its first element is the character vector of names.
names <- str_extract_all(string = raw.data, pattern = "[[:alpha:]., ]{2,}")[[1]]
names
## [1] "Moe Szyslak" "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" "Simpson, Homer" "Dr. Julius Hibbert"
3.1 Use the tools of this chapter to rearrange the vector so that all the elements conform to the standard first_name last_name format.
To get the standard first_name last_name format, we need to remove middle names and titles.
remove middle names :
# Strip a middle initial: a single letter followed by a period, with a space
# on either side. [[:alpha:]] is used rather than [A-z], because the A-z range
# also covers the punctuation characters that sit between "Z" and "a" in
# ASCII ([ \ ] ^ _ `).
# Both surrounding spaces are part of the match, so "Burns, C. Montgomery"
# collapses to "Burns,Montgomery".
names_no_middle_name <- sub(" [[:alpha:]]\\. ", "", names)
names_no_middle_name
## [1] "Moe Szyslak" "Burns,Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" "Simpson, Homer" "Dr. Julius Hibbert"
remove titles from the names :
# Step 1: drop a leading title such as "Rev." or "Dr." -- two or three letters
# followed by a period and a space. The pattern is anchored with ^ so only a
# prefix is removed, and [[:alpha:]] replaces [A-z], whose range also matches
# the punctuation characters between "Z" and "a" in ASCII.
names_final <- sub("^[[:alpha:]]{2,3}\\. ", "", names_no_middle_name)
# Step 2: rewrite "last_name, first_name" as "first_name last_name". Any
# whitespace after the comma is optional; entries without a comma do not match
# and are returned unchanged.
names_final <- sub("^([^,]+),[[:space:]]*(.+)$", "\\2 \\1", names_final)
names_final
## [1] "Moe Szyslak" "Montgomery Burns" "Timothy Lovejoy"
## [4] "Ned Flanders" "Homer Simpson" "Julius Hibbert"
DataFrame
# One-column data frame; the column is named after the vector.
df.names <- data.frame(names_final = names_final)
df.names
## names_final
## 1 Moe Szyslak
## 2 Montgomery Burns
## 3 Timothy Lovejoy
## 4 Ned Flanders
## 5 Homer Simpson
## 6 Julius Hibbert
3.2 Construct a logical vector indicating whether a character has a title
# Detect titles on names_no_middle_name (the vector with the middle initial
# already stripped): a title is taken to be two or more letters immediately
# followed by a period.
titles <- str_detect(
  string = names_no_middle_name,
  pattern = "[[:alpha:]]{2,}\\."
)
titles
## [1] FALSE FALSE TRUE FALSE FALSE TRUE
# Show the flags next to the original, unmodified names.
df.titles <- data.frame(names = names, titles = titles)
df.titles
## names titles
## 1 Moe Szyslak FALSE
## 2 Burns, C. Montgomery FALSE
## 3 Rev. Timothy Lovejoy TRUE
## 4 Ned Flanders FALSE
## 5 Simpson, Homer FALSE
## 6 Dr. Julius Hibbert TRUE
3.3 Construct a logical vector that indicates if a character has a second name
# A second name shows up as an initial: an uppercase letter directly followed
# by a period. Titles such as "Rev." and "Dr." end in a lowercase letter
# before the period, so they do not match.
secondname <- str_detect(string = names, pattern = "[A-Z]\\.{1}")
# Show the flags next to the original names.
df.secondname <- data.frame(names = names, secondname = secondname)
df.secondname
## names secondname
## 1 Moe Szyslak FALSE
## 2 Burns, C. Montgomery TRUE
## 3 Rev. Timothy Lovejoy FALSE
## 4 Ned Flanders FALSE
## 5 Simpson, Homer FALSE
## 6 Dr. Julius Hibbert FALSE
4.1 [0-9]+\$
One or more digits (0-9) immediately followed by a literal dollar sign ($)
# Test strings: the first three have digits directly before a "$", the last
# one has digits but no dollar sign.
sample <- c("5748900000$", "omer35$", "38$", "38")
# One or more digits followed by a dollar sign; the "$" is escaped so it is
# matched literally instead of acting as the end-of-string anchor.
expression <- "[0-9]+\\$"
str_detect(string = sample, pattern = expression)
## [1] TRUE TRUE TRUE FALSE
4.2 \b[a-z]{1,4}\b
Any word made up of 1 to 4 lowercase letters
# Test words of two to four letters, plus one word that is too long.
sample_2 <- c("car", "cats", "door", "hi", "datascience")
# One to four lowercase letters enclosed by word boundaries on both sides.
expression_2 <- "\\b[a-z]{1,4}\\b"
str_detect(string = sample_2, pattern = expression_2)
## [1] TRUE TRUE TRUE TRUE FALSE
4.3 .*?\.txt$
Any string that ends with the literal suffix .txt
# File-name-like test strings; "txt" lacks the period before the extension.
sample_3 <- c("cars.txt", "txt", "timeseries.txt", "code3434.txt")
# Any characters (lazily matched), then a literal ".txt" at the end of the
# string.
expression_3 <- ".*?\\.txt$"
str_detect(string = sample_3, pattern = expression_3)
## [1] TRUE FALSE TRUE TRUE
4.4 \d{2}/\d{2}/\d{4}
Any string containing digits in the format dd/dd/dddd (two digits, a slash, two digits, a slash, four digits), e.g. a date such as 02/12/2019
# Only the second test string has exactly two digits, a slash, two digits,
# a slash and four digits in sequence.
sample_4 <- c("100/1000/10000", "02/12/2019", "2/12/2019")
# Two digits / two digits / four digits.
expression_4 <- "\\d{2}/\\d{2}/\\d{4}"
str_detect(string = sample_4, pattern = expression_4)
## [1] FALSE TRUE FALSE
4.5 <(.+?)>.+?</\1>
An opening tag <name>, followed by at least one character, followed by the matching closing tag </name>; \1 is a back-reference to the tag name captured by the first group
# The first string closes its tag with "</omer>"; the second repeats the
# opening tag instead of closing it.
sample_5 <- c("<omer>hello</omer>", "<omer>hello<omer>")
# "<", a lazily captured tag name, ">", at least one character of content,
# then "</" followed by the same captured name (back-reference \\1) and ">".
expression_5 <- "<(.+?)>.+?</\\1>"
str_detect(string = sample_5, pattern = expression_5)
## [1] TRUE FALSE
# Scrambled string with a message hidden in it.
code <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"
code
## [1] "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"
# Pull out each uppercase letter, in order of appearance; read together they
# spell the hidden message.
str_extract_all(string = code, pattern = "[[:upper:]]")
## [[1]]
## [1] "C" "O" "N" "G" "R" "A" "T" "U" "L" "A" "T" "I" "O" "N" "S" "Y" "O"
## [18] "U" "A" "R" "E" "A" "S" "U" "P" "E" "R" "N" "E" "R" "D"