Overview of the dataset
library(stringr)
# Raw input: one string in which phone numbers and names are concatenated
# without separators. It is assembled here one record per line; the
# resulting value is a single string.
raw.data <- paste0(
  "555-1239Moe Szyslak",
  "(636) 555-0113Burns, C. Montgomery",
  "555-6542Rev. Timothy Lovejoy",
  "555 8904Ned Flanders",
  "636-555-3226Simpson, Homer",
  "5553642Dr. Julius Hibbert"
)
head(raw.data)
## [1] "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"
str(raw.data)
## chr "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-32"| __truncated__
Exercise 3.1
- Making the replacement function as a helper
# Remove a pattern from every element of a character vector.
#
# fname: character vector to clean.
# w:     pattern handed to str_replace() (interpreted as a regular
#        expression, so "." matches any character unless escaped).
#
# Returns `fname` with the FIRST match of `w` in each element replaced by
# the empty string; elements without a match come back unchanged.
multireplace <- function(fname, w) {
  str_replace(fname, pattern = w, replacement = "")
}
# `name` is not defined anywhere in the visible script, so build it here:
# every run of two or more letters, periods, commas or spaces in raw.data
# is taken as one name entry (the phone numbers never contain two such
# characters in a row, so they are skipped).
name <- unlist(str_extract_all(raw.data, "[[:alpha:]., ]{2,}"))

# Strip the middle initial and the titles. The periods are escaped so that
# they match a literal "." rather than any character.
names <- multireplace(name, "C\\. ")
names <- multireplace(names, "Rev\\. ")
names <- multireplace(names, "Dr\\. ")
names
## [1] "Moe Szyslak" "Burns, Montgomery" "Timothy Lovejoy"
## [4] "Ned Flanders" "Simpson, Homer" "Julius Hibbert"
- Looking for names with commas and switching
# Split every entry of `names` into a first and a last name.
#
# Entries written as "Last, First" are recognised by their comma and the two
# parts are swapped; all other entries are already "First Last". The previous
# loop tested for an UPPERCASE letter directly before the comma, which never
# matched ("Burns," ends in a lowercase letter), and had no `else`, so the
# comma entries were stored the wrong way round.
is_last_first <- str_detect(names, ",")

# Split once on an optional comma followed by whitespace; n = 2 keeps exactly
# two columns per name, and the separator (including the comma) is consumed,
# so no comma clean-up is needed afterwards.
name_parts <- str_split_fixed(names, ",?\\s+", n = 2)

first_name <- ifelse(is_last_first, name_parts[, 2], name_parts[, 1])
last_name <- ifelse(is_last_first, name_parts[, 1], name_parts[, 2])
first_name
## [1] "Moe" "Montgomery" "Timothy" "Ned" "Homer"
## [6] "Julius"
last_name
## [1] "Szyslak" "Burns" "Lovejoy" "Flanders" "Simpson" "Hibbert"
tname <- data.frame(first_name, last_name)
tname
## first_name last_name
## 1 Moe Szyslak
## 2 Montgomery Burns
## 3 Timothy Lovejoy
## 4 Ned Flanders
## 5 Homer Simpson
## 6 Julius Hibbert
Exercise 3.2 Looking for name starting with title
# `name` is expected to hold the extracted names (it is not defined in this
# chunk). First remove the match of "C." (the middle initial in this data)
# so that its period is not mistaken for a title.
nametitle <- multireplace(fname = name, w = "C.")
# A name is flagged when it contains a period that directly follows a word
# character (word boundary, then a literal ".").
hastitle <- str_detect(string = nametitle, pattern = "\\b[.]")
hastitle
## [1] FALSE FALSE TRUE FALSE FALSE TRUE
Exercise 4 using regular expression
Exercise 4.1.
- This regular expression returns every run of one or more digits (0-9)
- that is immediately followed by a dollar sign
# Sample strings used by the Exercise 4 regular expressions.
compta <- c(
  "try$87/68/89$",
  "abo$2670000note",
  "bretail$",
  "0890463$76575759$",
  "Roc$292mjk0099$"
)
# Collect all matches from all strings into one flat character vector.
amount <- unlist(
  str_extract_all(string = compta, pattern = "[0-9]+\\$")
)
amount
## [1] "89$" "0890463$" "76575759$" "0099$"
Exercise 4.2.
- This regular expression returns lowercase letters (a to z) occurring
- 1 to 4 times in a row, with a word boundary at both the beginning and
- the end of the match.
# Runs of 1-4 lowercase letters delimited by word boundaries on both sides,
# gathered from every element of `compta` into one flat vector.
note <- unlist(
  str_extract_all(string = compta, pattern = "\\b[a-z]{1,4}\\b")
)
note
## [1] "try" "abo"
Exercise 4.3.
- This regular expression returns a file name (which may be empty before
- the dot) that ends with the extension .txt
# Candidate file names; only those ending in ".txt" should be returned.
textmode <- c(
  "average.txt",
  "python.py",
  "generalmotor.tx",
  ".txt"
)
# Lazy "anything" followed by a literal ".txt" anchored at the end of the
# string; non-matching elements contribute nothing to the flat result.
nofile <- unlist(
  str_extract_all(string = textmode, pattern = ".*?\\.txt$")
)
nofile
## [1] "average.txt" ".txt"
Exercise 4.4. This regular expression extracts dates written in the format
mm/dd/yyyy
# One string holding several dates separated by arbitrary characters.
date <- "09/25/1967, 05/25/1968d10/02/1980m12/27/2009"
# Two digits, slash, two digits, slash, four digits; all matches are
# returned as one flat character vector.
dates <- unlist(
  str_extract_all(string = date, pattern = "\\d{2}/\\d{2}/\\d{4}")
)
dates
## [1] "09/25/1967" "05/25/1968" "10/02/1980" "12/27/2009"
Exercise 9
# Encoded message; the line breaks are part of the string.
cod <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj
5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.
r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7w
Em24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"
# Decode in one expression: keep only the uppercase letters and periods,
# glue them together, then turn each period into a space.
uncod <- str_replace_all(
  paste(unlist(str_extract_all(cod, "[[:upper:].]{1,}")), collapse = ""),
  pattern = "\\.",
  replacement = " "
)
uncod
## [1] "CONGRATULATIONS YOU ARE A SUPERNERD"