For this homework, the stringr library is needed to manipulate string (qualitative) data.
library(stringr)
Load the given raw data and extract the names into a vector called names
# Raw phone-book text: names and phone numbers run together in one string.
data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"
# A name is any run of two or more letters, periods, commas or spaces.
# str_extract_all() returns a list with one element per input string, so the
# first (and only) element is the character vector of names.
names <- str_extract_all(string = data, pattern = "[[:alpha:]., ]{2,}")[[1]]
names
## [1] "Moe Szyslak" "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" "Simpson, Homer" "Dr. Julius Hibbert"
Copy the introductory example. The vector names stores the extracted names.
# Rewrite "Last, First" entries as "First Last"; entries without a comma are
# left untouched. The whole vector is processed at once instead of element by
# element, which also behaves correctly when `names` is empty.
has_comma <- str_detect(names, ",")
# Split at the first comma only: column 1 holds the part before the comma
# (the last name), column 2 the remainder of the entry.
name_parts <- str_split_fixed(names[has_comma], ",", n = 2)
# Trim both pieces before joining so the blank that followed the comma does
# not end up as a leading space in the result.
names[has_comma] <- str_c(
  str_trim(name_parts[, 2]),
  " ",
  str_trim(name_parts[, 1])
)
names
## [1] "Moe Szyslak" "C. Montgomery Burns" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" "Homer Simpson" "Dr. Julius Hibbert"
The grepl function will be used to indicate whether each character's name includes a title.
# Flag entries that carry a title ("Rev." or "Dr."). The periods are escaped
# so they match a literal "." rather than any character, and \\b keeps the
# title from matching in the middle of a longer word.
title <- grepl("\\b(Rev|Dr)\\.", names)
# Show each name next to its title flag.
names_title <- data.frame(names, title)
names_title
## names title
## 1 Moe Szyslak FALSE
## 2 C. Montgomery Burns FALSE
## 3 Rev. Timothy Lovejoy TRUE
## 4 Ned Flanders FALSE
## 5 Homer Simpson FALSE
## 6 Dr. Julius Hibbert TRUE
# Flag entries containing an initial: a single uppercase letter followed by a
# period. The leading \\b requires the letter to start a word, so longer
# abbreviations that merely end in a capital (e.g. "JR.") are not counted.
mid_name <- str_detect(names, "\\b[[:upper:]]\\.")
# Show each name next to its initial flag.
name.mid_name <- data.frame(names, mid_name)
name.mid_name
## names mid_name
## 1 Moe Szyslak FALSE
## 2 C. Montgomery Burns TRUE
## 3 Rev. Timothy Lovejoy FALSE
## 4 Ned Flanders FALSE
## 5 Homer Simpson FALSE
## 6 Dr. Julius Hibbert FALSE
Describe the types of strings that conform to the following regular expressions and construct an example that is matched by the regular expression.
It matches a number followed by a dollar sign: the pattern extracts one or more digits that are immediately followed by the literal character $.
# Example input: a sentence containing a price written as digits plus "$".
mystring <- "the cost of two t-shirt is 50$"
# Pull out the first run of digits that is directly followed by a dollar sign.
str_extract(string = mystring, pattern = "[0-9]+\\$")
## [1] "50$"
It matches lowercase alphabetic words. The pattern detects a whole word made up of one to four lowercase letters anywhere within a string.
# Test strings: short and long words, an uppercase letter, and mixed tokens.
mystring <- c(
  "can", "fatime", "of", "F", "f", "wxsvyz", "with", "abc popo 12c", "d12c"
)
# First standalone word of one to four lowercase letters in each string
# (NA where there is none).
str_extract(string = mystring, pattern = "\\b[a-z]{1,4}\\b")
## [1] "can" NA "of" NA "f" NA "with" "abc" NA
# The same pattern used as a yes/no test per string.
str_detect(string = mystring, pattern = "\\b[a-z]{1,4}\\b")
## [1] TRUE FALSE TRUE FALSE TRUE FALSE TRUE TRUE FALSE
C. .*?\.txt$
It matches any string that ends with .txt: an arbitrary (possibly empty) sequence of characters followed by the literal suffix .txt at the very end of the string, such as a text file name.
# Test strings: some end in ".txt", one contains ".txt" mid-sentence, and one
# has a different extension.
mystring <- c(
  "ali.txt",
  "123&.txt",
  " the file has a.txt extension",
  " names.txt",
  "alex.ipt"
)
# Whole string when it ends in a literal ".txt", otherwise NA.
str_extract(string = mystring, pattern = ".*?\\.txt$")
## [1] "ali.txt" "123&.txt" NA " names.txt" NA
# The same pattern used as a yes/no test per string.
str_detect(string = mystring, pattern = ".*?\\.txt$")
## [1] TRUE TRUE FALSE TRUE FALSE
It matches strings of digits and slashes. The pattern detects any substring made of two digits, a slash, two digits, a slash, and four digits, such as a date written as mm/dd/yyyy or dd/mm/yyyy.
# Test strings: a slash-separated date, a dash-separated date, and a date
# embedded in a sentence.
mystring <- c(
  "02/15/2000",
  "02-15-2000",
  "born in 02/15/1975"
)
# First "NN/NN/NNNN" digit group in each string (NA where there is none).
str_extract(string = mystring, pattern = "\\d{2}/\\d{2}/\\d{4}")
## [1] "02/15/2000" NA "02/15/1975"
# The same pattern used as a yes/no test per string.
str_detect(string = mystring, pattern = "\\d{2}/\\d{2}/\\d{4}")
## [1] TRUE FALSE TRUE
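It matches an HTML/XML-style element. The pattern detects three parts within the string: an opening tag <name>, at least one character of content, and a closing tag </name> whose name must be identical to the opening tag's name (enforced by the backreference \1).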
# Test strings: two complete tagged elements and one preceded by extra text.
mystring <- c(
  "<html> Alex </html>",
  "<xyz> some text</xyz>",
  " abc <xyz> some text</xyz>"
)
# First element whose closing tag repeats the opening tag's name; \\1 refers
# back to the name captured inside the opening tag.
str_extract(string = mystring, pattern = "<(.+?)>.+?</\\1>")
## [1] "<html> Alex </html>" "<xyz> some text</xyz>" "<xyz> some text</xyz>"
# The same pattern used as a yes/no test per string.
str_detect(string = mystring, pattern = "<(.+?)>.+?</\\1>")
## [1] TRUE TRUE TRUE
The following code hides a secret message. Crack it with R and regular expressions. Hint: Some of the characters are more revealing than others! The code snippet is also available in the materials at www.r-datacollection.com.
clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0Tanwo Uwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigO d6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5 fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr
There are many approaches to finding the secret, some much simpler than mine, but I chose my own solution.
# Crack the hidden message: periods separate the words and the uppercase
# letters inside each segment spell them out.
mystring <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"
# One segment per word; fixed = TRUE treats "." as a literal period.
split_mystring <- strsplit(mystring, ".", fixed = TRUE)[[1]]
# Delete everything that is not an uppercase letter, for all segments at once.
secret_words <- gsub("[^[:upper:]]", "", split_mystring)
# Drop segments that held no uppercase letters so they contribute neither an
# NA nor a doubled space to the message.
secret_words <- secret_words[nzchar(secret_words)]
# Join the words with single spaces (no leading blank).
secret_message <- paste(secret_words, collapse = " ")
secret_message
## [1] "CONGRATULATIONS YOU ARE A SUPERNERD"