# Widen console output to 100 characters per line.
options(width = 100)
# The assignment folder sits under a different parent directory on my
# Windows and Linux machines, so the working directory is chosen by
# operating system.
setwd(
  if (Sys.info()["sysname"] == "Windows") {
    "~/Masters/DATA607/Week3/Assignment"
  } else {
    "~/Documents/Masters/DATA607/Week3/Assignment"
  }
)
library(stringr)
# Raw text in which phone numbers and names run together.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"
# A name is any run of two or more letters, periods, commas or spaces.
# raw.data is a single string, so the first (and only) list element holds
# every match.
name <- str_extract_all(raw.data, "[[:alpha:]., ]{2,}")[[1]]
# Keep the untouched names in a column called "original".
name_df <- data.frame(original = name)
name_df
## original
## 1 Moe Szyslak
## 2 Burns, C. Montgomery
## 3 Rev. Timothy Lovejoy
## 4 Ned Flanders
## 5 Simpson, Homer
## 6 Dr. Julius Hibbert
# Report, for each element of `x`, whether it contains a comma.
# Returns a logical vector as long as `x`; missing elements stay NA.
# The result is returned visibly (the body no longer ends in an assignment).
has_comma <- function(x) {
  # fixed = TRUE makes grepl() treat the comma as a literal character.
  found <- grepl(",", x, fixed = TRUE)
  # grepl() reports FALSE for missing values; keep them missing instead.
  found[is.na(x)] <- NA
  found
}
# Split one raw name into its components.
#
# Two layouts are handled: "[Title. ]First Last" and "Last, First [Second]".
# The match would not work if a title came after the comma
# (e.g. "last_name, title first_name"), but the data set in question does
# not have this issue.
#
# x: a single name string.
# Returns a character vector of length five holding, in order: last name,
# first name, second name, title and "first_name last_name". Components the
# pattern did not capture are NA.
convert_names <- function(x) {
  name_pattern <- paste0(
    "([[:alpha:]]+\\.\\s)?",   # group 1: optional title such as "Rev. "
    "([[:alpha:]]+),?\\s",     # group 2: first word, optional comma after it
    "([[:alpha:]]+\\.?\\s?)",  # group 3: second word or initial
    "([[:alpha:]]+\\.?\\s?)?"  # group 4: optional third word
  )
  # Linear indices 1-5 of the match matrix are the whole match followed by
  # the four capture groups. Groups 1, 3 and 4 can capture a trailing space
  # (e.g. "C. "), which would otherwise leak into the combined name, so all
  # pieces are trimmed.
  parts <- trimws(str_match(x, name_pattern)[1:5])

  # Groups 2 and 3 swap roles when the name is written "Last, First".
  if (isTRUE(has_comma(x))) {
    last_name <- parts[3]
    first_name <- parts[4]
  } else {
    last_name <- parts[4]
    first_name <- parts[3]
  }
  second_name <- parts[5]
  title <- parts[2]

  first_and_last_name <- paste(first_name, last_name, sep = " ")
  c(last_name, first_name, second_name, title, first_and_last_name)
}
# Parse every original name. vapply() checks that each result is a character
# vector of length five, and USE.NAMES = FALSE stops the input strings from
# being attached to the result as names (they would otherwise end up as the
# row names of name_df).
name_conv <- vapply(
  as.character(name_df$original),
  convert_names,
  FUN.VALUE = character(5),
  USE.NAMES = FALSE
)
# name_conv holds one column per name and one row per component, so it is
# transposed before its rows become the new columns of name_df.
name_df[c("last_name", "first_name", "second_name", "title", "first_and_last_name")] <-
  as.data.frame(t(name_conv), stringsAsFactors = FALSE)
name_df
## original last_name first_name second_name title first_and_last_name
## 1 Moe Szyslak Szyslak Moe <NA> <NA> Moe Szyslak
## 2 Burns, C. Montgomery Burns C. Montgomery <NA> C. Burns
## 3 Rev. Timothy Lovejoy Lovejoy Timothy <NA> Rev. Timothy Lovejoy
## 4 Ned Flanders Flanders Ned <NA> <NA> Ned Flanders
## 5 Simpson, Homer Simpson Homer <NA> <NA> Homer Simpson
## 6 Dr. Julius Hibbert Hibbert Julius <NA> Dr. Julius Hibbert
# Show the rearranged "first_name last_name" values as a character vector.
as.character(name_df[["first_and_last_name"]])
## [1] "Moe Szyslak" "C. Burns" "Timothy Lovejoy" "Ned Flanders" "Homer Simpson"
## [6] "Julius Hibbert"
# Flag the names that open with a title: one or more letters followed by a
# period at the very start of the string (e.g. "Rev." or "Dr.").
name_df[["title_vector"]] <- str_detect(name_df$original, "^[[:alpha:]]+[.]")
name_df
## original last_name first_name second_name title first_and_last_name title_vector
## 1 Moe Szyslak Szyslak Moe <NA> <NA> Moe Szyslak FALSE
## 2 Burns, C. Montgomery Burns C. Montgomery <NA> C. Burns FALSE
## 3 Rev. Timothy Lovejoy Lovejoy Timothy <NA> Rev. Timothy Lovejoy TRUE
## 4 Ned Flanders Flanders Ned <NA> <NA> Ned Flanders FALSE
## 5 Simpson, Homer Simpson Homer <NA> <NA> Homer Simpson FALSE
## 6 Dr. Julius Hibbert Hibbert Julius <NA> Dr. Julius Hibbert TRUE
# The title flags on their own, as a logical vector.
name_df[["title_vector"]]
## [1] FALSE FALSE TRUE FALSE FALSE TRUE
# Flag the names that contain a single-letter initial: a non-letter, one
# letter, a period, whitespace and then at least one more letter.
name_df[["second_name_test"]] <- str_detect(
  string = name_df$original,
  pattern = "[^[:alpha:]][[:alpha:]]\\.\\s[[:alpha:]]+"
)
name_df
## original last_name first_name second_name title first_and_last_name title_vector
## 1 Moe Szyslak Szyslak Moe <NA> <NA> Moe Szyslak FALSE
## 2 Burns, C. Montgomery Burns C. Montgomery <NA> C. Burns FALSE
## 3 Rev. Timothy Lovejoy Lovejoy Timothy <NA> Rev. Timothy Lovejoy TRUE
## 4 Ned Flanders Flanders Ned <NA> <NA> Ned Flanders FALSE
## 5 Simpson, Homer Simpson Homer <NA> <NA> Homer Simpson FALSE
## 6 Dr. Julius Hibbert Hibbert Julius <NA> Dr. Julius Hibbert TRUE
## second_name_test
## 1 FALSE
## 2 TRUE
## 3 FALSE
## 4 FALSE
## 5 FALSE
## 6 FALSE
# The second-name flags on their own, as a logical vector.
name_df[["second_name_test"]]
## [1] FALSE TRUE FALSE FALSE FALSE FALSE
This regular expression matches one or more digits immediately followed by a literal dollar sign ($).
# The first string ends in "$"; the second contains digits only.
problem_4_1_test_string <- c("453534$", "34556456")
# The pattern has no capture groups, so extracting the full matches is enough.
unlist(str_extract_all(problem_4_1_test_string, "[0-9]+\\$"))
## [1] "453534$"
This regular expression matches any whole word made up of one to four lowercase letters.
# "three" has five letters; the other words have three or four.
problem_4_2_test_string <- c("one", "two", "three", "four")
# The pattern has no capture groups, so extracting the full matches is enough.
unlist(str_extract_all(problem_4_2_test_string, "\\b[a-z]{1,4}\\b"))
## [1] "one" "two" "four"
This regular expression matches any string that ends in “.txt”: the lazy “.*?” matches whatever text (possibly none) precedes the extension, and “$” anchors “.txt” to the end of the string.
# Only "file.txt" ends in ".txt"; "txt" lacks the period and "file.csv"
# has a different extension.
problem_4_3_test_string <- c("txt", "file.txt", "file.csv")
# The pattern has no capture groups, so extracting the full matches is enough.
unlist(str_extract_all(problem_4_3_test_string, ".*?\\.txt$"))
## [1] "file.txt"
This regular expression matches any string in the date format mm/dd/yyyy (or dd/mm/yyyy); the day and the month must each be written with two digits.
# The second date uses one-digit day and month values.
problem_4_4_test_string <- c("12/25/2016", "1/1/2017")
# The pattern has no capture groups, so extracting the full matches is enough.
unlist(str_extract_all(problem_4_4_test_string, "\\d{2}/\\d{2}/\\d{4}"))
## [1] "12/25/2016"
This regular expression returns any text encased within HTML tags, which start with a “<tag>” and end with the matching “</tag>”.
# The first string has identical opening and closing tag text and matches.
# In the second, the opening tag carries an attribute, so the captured tag
# text "p color=black" has no identical closing tag and nothing is returned.
# (The earlier test string had a stray ">" after "<p", which made the
# "No match" example match.)
problem_4_5_test_string <- c("<p> This text is black </p>", "<p color=black> No match </p>")
# str_match_all() returns the whole match followed by the captured tag text.
unlist(str_match_all(problem_4_5_test_string, "<(.+?)>.+?</\\1>"))
## [1] "<p> This text is black </p>" "p"
secret_message <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"
# Only the upper-case letters and punctuation marks carry the hidden text.
# cat() returns NULL, so the extracted pieces are first collapsed into one
# string that `message` can actually hold, and that string is then printed.
message <- paste(
  unlist(str_extract_all(secret_message, "([A-Z]+|[[:punct:]]){1,}")),
  collapse = ""
)
cat(message, sep = "")
## CONGRATULATIONS.YOU.ARE.A.SUPERNERD!