Using Regular Expressions

Overview of the dataset

library(stringr)
# Input for the exercises: phone numbers and names run together in one string.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"

# Show the value itself first ...
head(x = raw.data)
## [1] "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"
# ... then its compact structure summary.
str(object = raw.data)
##  chr "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-32"| __truncated__

Extracting names

# Every run of two or more letters, periods, commas or spaces is taken as one
# name. raw.data is a single string, so the result list has exactly one
# element, which is pulled out directly.
name <- str_extract_all(string = raw.data, pattern = "[[:alpha:]., ]{2,}")[[1]]
print(name)
## [1] "Moe Szyslak"          "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders"         "Simpson, Homer"       "Dr. Julius Hibbert"

Exercise 3.1

  • Making the replacement function as a helper
 # Delete the first match of the regular expression `w` from every element of
 # the character vector `fname`.
 #
 # fname: character vector to clean.
 # w:     pattern handed to str_replace(); it is interpreted as a regular
 #        expression, and only the first match per element is removed.
 # Returns the cleaned strings, one per element of `fname`.
 multireplace <- function(fname, w) {
   str_replace(string = fname, pattern = w, replacement = "")
 }
# Strip the middle initial and the titles so that only first and last names
# remain. multireplace() passes its second argument on as a regular
# expression, where a bare "." matches any character, so the periods are
# escaped to match literal periods only. The patterns are applied one after
# another, starting from `name`.
names <- Reduce(multireplace, c("C\\. ", "Rev\\. ", "Dr\\. "), name)
names
## [1] "Moe Szyslak"       "Burns, Montgomery" "Timothy Lovejoy"  
## [4] "Ned Flanders"      "Simpson, Homer"    "Julius Hibbert"
  • Looking for names with commas and switching
# Split every cleaned name into a first and a last name.
#
# Entries written as "Last, First" are reordered to "First Last" before the
# split. A test for a capital letter directly before the comma ("[A-Z]\\, +")
# can never match names such as "Burns, Montgomery", which would leave the
# family names "Burns" and "Simpson" in the first-name column; any comma is
# therefore treated as the marker for the reversed form.
ordered_names <- str_replace(names, "^(.*?),\\s*(.*)$", "\\2 \\1")

# Column 1 holds the text before the first run of whitespace, column 2 the
# remainder of the name.
name_parts <- str_split_fixed(ordered_names, "\\s+", n = 2)
first_name <- name_parts[, 1]
last_name <- name_parts[, 2]
first_name
## [1] "Moe"        "Montgomery" "Timothy"    "Ned"        "Homer"     
## [6] "Julius"
last_name
## [1] "Szyslak"  "Burns"    "Lovejoy"  "Flanders" "Simpson"  "Hibbert"
tname <- data.frame(first_name, last_name)
tname
##   first_name last_name
## 1        Moe   Szyslak
## 2 Montgomery     Burns
## 3    Timothy   Lovejoy
## 4        Ned  Flanders
## 5      Homer   Simpson
## 6     Julius   Hibbert

Exercise 3.2 Looking for names starting with a title

# A title is an abbreviation of at least two letters followed by a period
# ("Rev.", "Dr."). A single-letter middle initial such as "C." is too short to
# match, so no initial has to be removed from the names beforehand.
hastitle <- str_detect(name, "\\b[[:alpha:]]{2,}\\.")
hastitle
## [1] FALSE FALSE  TRUE FALSE FALSE  TRUE

Exercise 3.3 People with a second name

# A second name is recognised by its initial: one capital letter directly
# followed by a period.
has2name <- str_detect(string = name, pattern = "[A-Z]\\.")
print(has2name)
## [1] FALSE  TRUE FALSE FALSE FALSE FALSE

Exercise 4 using regular expression

Exercise 4.1.

  • This regular expression matches one or more digits (0-9)
  • immediately followed by a literal dollar sign
# Sample strings mixing letters, digits and dollar signs.
compta <- c(
  "try$87/68/89$",
  "abo$2670000note",
  "bretail$",
  "0890463$76575759$",
  "Roc$292mjk0099$"
)

# Collect the matches per string, then flatten them into one character vector.
amount <- str_extract_all(string = compta, pattern = "[0-9]+\\$")
amount <- unlist(amount)
print(amount)
## [1] "89$"       "0890463$"  "76575759$" "0099$"

Exercise 4.2.

  • This regular expression matches lowercase letters (a to z),
  • with a minimum of 1 and a maximum of 4 occurrences, delimited by a word
  • boundary at the beginning and at the end.
# Words of one to four lowercase letters, bounded on both sides; the
# per-string matches are flattened into a single character vector.
note <- str_extract_all(string = compta, pattern = "\\b[a-z]{1,4}\\b")
note <- unlist(note)
print(note)
## [1] "try" "abo"

Exercise 4.3.

  • This regular expression matches a string that ends in ".txt",
  • preceded by any (possibly empty) sequence of characters
# Candidate file names; only those ending in ".txt" should be returned.
textmode <- c(
  "average.txt",
  "python.py",
  "generalmotor.tx",
  ".txt"
)

# Collect the matches per file name, then flatten them into one vector.
nofile <- str_extract_all(string = textmode, pattern = ".*?\\.txt$")
nofile <- unlist(nofile)
print(nofile)
## [1] "average.txt" ".txt"

Exercise 4.4. This regular expression extracts dates in the format

mm/dd/yyyy

# One string holding four dates separated by assorted characters.
date <- "09/25/1967, 05/25/1968d10/02/1980m12/27/2009"

# Two digits, slash, two digits, slash, four digits; `date` is a single
# string, so the matches are the only element of the result list.
dates <- str_extract_all(string = date, pattern = "\\d{2}/\\d{2}/\\d{4}")[[1]]
print(dates)
## [1] "09/25/1967" "05/25/1968" "10/02/1980" "12/27/2009"

Exercise 4.5. This expression extracts text enclosed in a matching pair of HTML tags

# Test strings: two complete elements, one lone tag and one empty tag.
htmltext <- c(
  "<p>football</p>",
  "<Solid>",
  "<>",
  "<center>String processing usesnstringr package.</center>"
)

# The backreference \\1 forces the closing tag to repeat the opening tag's
# name; the per-string matches are then flattened into one vector.
hnote <- str_extract_all(string = htmltext, pattern = "<(.+?)>.+?</\\1>")
hnote <- unlist(hnote)
print(hnote)
## [1] "<p>football</p>"                                         
## [2] "<center>String processing usesnstringr package.</center>"

Exercise 9

# Encoded message: the hidden text is spelled by the upper-case letters, with
# periods standing in for the spaces between words.
cod <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj
5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.
r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7w
Em24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"

# Keep only the runs of upper-case letters and periods (cod is one string, so
# they form the single element of the result list), glue them together, and
# finally turn every period into a space.
uncod <- str_extract_all(string = cod, pattern = "[[:upper:].]{1,}")[[1]]
uncod <- str_replace_all(
  paste(uncod, collapse = ""),
  pattern = "\\.",
  replacement = " "
)
print(uncod)
## [1] "CONGRATULATIONS YOU ARE A SUPERNERD"