Please deliver links to an R Markdown file (in GitHub and rpubs.com) with solutions to problems 3 and 4 from chapter 8 of Automated Data Collection in R. Problem 9 is extra credit.
# Need to employ stringr for Regular Expressions
library(stringr)
# Unstructured phone-book text in which names and phone numbers run together
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"
# A name is any run of two or more letters, periods, commas or spaces
name_pattern <- "[[:alpha:]., ]{2,}"
# str_extract_all() returns one list element per input string; raw.data is a
# single string, so the first element holds every match
name <- str_extract_all(raw.data, name_pattern)[[1]]
name
## [1] "Moe Szyslak" "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" "Simpson, Homer" "Dr. Julius Hibbert"
# Original name vector obtained with the regular expression above
name
## [1] "Moe Szyslak" "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" "Simpson, Homer" "Dr. Julius Hibbert"
# Put every name into "first_name last_name" order.
# Only the names stored as "Last, First" contain a comma, so one regular
# expression with two capture groups can swap the two parts:
#   group 1 = everything before the comma (the last name)
#   group 2 = everything after the comma and any following spaces (the first name)
# Names without a comma do not match the pattern and are returned unchanged.
# str_replace() is vectorised, so the result is a plain character vector with
# one complete string per name; no data frame or factor round trip is needed.
split_name <- str_replace(name, "^([^,]+),\\s*(.+)$", "\\2 \\1")
split_name
## [1] "Moe Szyslak" "C. Montgomery Burns" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" "Homer Simpson" "Dr. Julius Hibbert"
# Titles to look for; they are matched literally, so "." means a real period
title <- c("Rev.", "Dr.")
# For each name, check whether ANY of the titles occurs in it.
# Passing the whole `title` vector as the pattern for the whole `name` vector
# would pair them element by element (name 1 with "Rev.", name 2 with "Dr.",
# ...), so each name would be tested against only one title. Testing one name
# at a time against every title removes that dependence on the data order.
has_title <- vapply(
  name,
  function(one_name) any(str_detect(one_name, fixed(title))),
  logical(1),
  USE.NAMES = FALSE
)
names_wtitle <- ifelse(has_title, "YES", "NO")
# Create a data frame that reports, next to each name, whether it has a title
names_wtitle <- data.frame(name = name, title = names_wtitle)
names_wtitle
## name title
## 1 Moe Szyslak NO
## 2 Burns, C. Montgomery NO
## 3 Rev. Timothy Lovejoy YES
## 4 Ned Flanders NO
## 5 Simpson, Homer NO
## 6 Dr. Julius Hibbert YES
# Identify whether each name has a middle name. A middle name is assumed to be
# written as an initial: an upper-case letter immediately followed by a period.
has_initial <- str_detect(name, "[A-Z]\\.")
middle_name <- ifelse(has_initial, "YES", "NO")
middle_name
## [1] "NO" "YES" "NO" "NO" "NO" "NO"
# Report the flag next to each name in a data frame.
names_wmname <- data.frame(name = name, middle = middle_name)
names_wmname
## name middle
## 1 Moe Szyslak NO
## 2 Burns, C. Montgomery YES
## 3 Rev. Timothy Lovejoy NO
## 4 Ned Flanders NO
## 5 Simpson, Homer NO
## 6 Dr. Julius Hibbert NO
This pattern, [0-9]+\\$, matches one or more digits ([0-9]+) immediately followed by a literal dollar sign (\\$), so it finds 123$ but not $123.
# Example strings: digits followed by "$" should match, "$" followed by digits should not
raw.vector <- c(
  "This is my example where 1234567890$ is represented at the end of the vector; it will report [1234567890$] two times with a dollar sign at the end",
  "Another good example is 123$ but not $123"
)
# One or more digits immediately followed by a literal dollar sign
dollar_matches <- str_extract_all(raw.vector, "[0-9]+\\$")
unlist(dollar_matches)
## [1] "1234567890$" "1234567890$" "123$"
This pattern, \\b[a-z]{1,4}\\b, matches every word of one to four letters written in LOWER CASE only: \\b marks a word boundary on each side and [a-z]{1,4} allows one to four lower-case letters in between.
# Example sentence used to extract the short lower-case words
raw.vector <- "In this example it will display all the words that are surrounded by edges \\b on both sides and composed of four letters or less but with LOWER CASE only."
# Whole words (word boundary on both sides) made of one to four lower-case letters
short_words <- str_extract_all(raw.vector, "\\b[a-z]{1,4}\\b")
unlist(short_words)
## [1] "this" "it" "will" "all" "the" "that" "are" "by" "b" "on"
## [11] "both" "and" "of" "four" "or" "less" "but" "with" "only"
This pattern, .*?\\.txt$, matches any string that ends with the extension .txt: .*? matches any run of characters (the ? makes the * lazy rather than optional), \\.txt matches the literal extension, and $ anchors the match to the end of the string, for example:
# Example strings: only those ending in ".txt" should be returned
raw.vector <- c(
  "In this example it will display all the sentences that are composed of a period . followed by a word that has the asterisk or star that could be optional and then followed by the extension.txt, for example:",
  "this is good my.homework.txt",
  "my.homew*rk.txt is a very good example not included because is not at the end of the sentence.",
  "but this one is great my.homew*rk.txt"
)
# Any characters (lazily) followed by a literal ".txt" at the end of the string
txt_matches <- str_extract_all(raw.vector, ".*?\\.txt$")
unlist(txt_matches)
## [1] "this is good my.homework.txt"
## [2] "but this one is great my.homew*rk.txt"
This pattern, \\d{2}/\\d{2}/\\d{4}, matches dates written with exactly two digits for the month, exactly two digits for the day and exactly four digits for the year, separated by slashes.
# Example strings: only dates in the two-digit/two-digit/four-digit layout should match
raw.vector <- c(
  "in this example we will extract 02/04/2016 and 08/09/1977",
  "Also we will not be able to obtain 4/15/1879 since it doest nt match the two digit month, neither 02/19/17"
)
# Two digits, slash, two digits, slash, four digits
date_matches <- str_extract_all(raw.vector, "\\d{2}/\\d{2}/\\d{4}")
unlist(date_matches)
## [1] "02/04/2016" "08/09/1977"
After doing some research, this pattern matches a complete HTML element: an opening tag, its content and the corresponding closing tag. The capture group (.+?) records the name of the opening tag.
The backreference \1 recalls whatever (.+?) matched, so the closing tag must carry the same name as the opening tag.
# Example string containing nested HTML tags
raw.vector <- "<!DOCTYPE html><html><body>Hello World</body></html></html>"
# An opening tag, some content, then a closing tag whose name repeats the
# text captured from the opening tag (backreference \\1)
tag_matches <- str_extract_all(raw.vector, "<(.+?)>.+?</\\1>")
unlist(tag_matches)
## [1] "<html><body>Hello World</body></html>"
clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0Tanwo Uwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigO d6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5 fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr
# Raw vector
raw.vector <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"
# The message is spelled by the upper-case letters, with periods between the
# words, so extract every run of upper-case letters and periods.
# Inside a bracket expression a "?" is a literal question mark, not an
# "optional" quantifier, so it is left out of the character class.
hidden_message <- unlist(str_extract_all(raw.vector, "[[:upper:].]+"))
hidden_message
## [1] "C" "O" "N" "G" "R" "A" "T" "U" "L" "AT" "I" "O" "N" "S"
## [15] "." "Y" "O" "U" "." "A" "R" "E" "." "A" ".S" "U" "P" "E"
## [29] "R" "N" "E" "R" "D"
# The periods act as word separators, so join all the pieces into one string
# and replace each literal period with a blank space.
hidden_message <- str_replace_all(paste(hidden_message, collapse = ""), fixed("."), " ")
# Final message
hidden_message
## [1] "CONGRATULATIONS YOU ARE A SUPERNERD"