Data607-Assignment3

library("stringr")

Load the raw data while we are at it.

# Raw phone-book text: six name / phone-number entries run together in one
# string. The literal is assembled from one piece per entry purely for
# readability; the resulting value is a single unbroken string.
raw.data <- paste0(
  "555-1239Moe Szyslak",
  "(636) 555-0113Burns, C. Montgomery",
  "555-6542Rev. Timothy Lovejoy",
  "555 8904Ned Flanders",
  "636-555-3226Simpson, Homer",
  "5553642Dr. Julius Hibbert"
)
raw.data

## [1] "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"

Copy the introductory example. The vector `names` stores the extracted names.

# Extract every run of two or more letters, periods, commas, or spaces from
# the raw text and flatten the result into a plain character vector.
names <- unlist(
  str_extract_all(
    string = raw.data,
    pattern = "[[:alpha:]., ]{2,}"
  )
)
names

## [1] "Moe Szyslak"          "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders"         "Simpson, Homer"       "Dr. Julius Hibbert"

1. Use the tools of this chapter to rearrange the vector so that all the elements conform to the standard first_name last_name.

The first regex I use, “,.+$”, identifies a string of one or more characters between a comma and the end of the string. This is to pull the first names of Monte Burns and Homer Simpson. The next, “\\..+ ”, extracts a string of one or more characters between a period and a space; this is for Rev. Lovejoy and Dr. Hibbert. Finally, for Moe and Ned, the regex “^.[^\\.]{2} ” extracts exactly 3 characters between the start of the string and a space, where the second and third characters are not periods.

For the last names there were two patterns: either the last name ended in a comma, as in the case of Homer and Monte (“.+,”), or the last name was 7 or 8 characters between a space and the end of the string, as for everyone else (“[ [:blank:]].{7,8}$”).

I used for loops to take out unwanted spaces and punctuation marks.

# Pull the first-name portion of each entry. The three alternatives cover
# "Last, First" entries (comma through end of string), titled entries (period
# through the last space), and plain three-letter first names.
first_name <- str_extract_all(names, ",.+$|\\..+ |^.[^\\.]{2} ")
# Clean up the data a little by getting rid of punctuation and stray spaces.
# seq_along() stays safe on empty input, and [[i]] hands gsub() the matched
# string itself rather than a one-element list.
for (i in seq_along(first_name)) {
  # Drop the ", " or ". " delimiter captured at the front of the match. The
  # pattern is anchored to the start, so the "C. " inside Monte's name is kept:
  # instead of dropping the C. I opted to keep it in, as I have a friend that
  # does this on his social media and uses it as his pen name.
  first_name[[i]] <- gsub("^[,.] ", "", first_name[[i]])
  # Remove the trailing space captured by the title and short-name patterns.
  first_name[[i]] <- trimws(first_name[[i]])
}

# Last names either end in a comma or are the final 7-8 characters after a
# blank. NOTE: the fixed 7-8 length only fits the surnames in this data set.
last_name <- str_extract_all(names, ".+,|[ [:blank:]].{7,8}$")
for (i in seq_along(last_name)) {
  last_name[[i]] <- gsub(",", "", last_name[[i]])
  # The blank-prefixed alternative keeps its leading space; trim it so that
  # paste() below does not put two spaces between first and last name.
  last_name[[i]] <- trimws(last_name[[i]])
}

# paste() joins each first name to its last name with a single space.
names_df <- data.frame(name = paste(first_name, last_name))
names_df

##                  name
## 1         Moe Szyslak
## 2 C. Montgomery Burns
## 3     Timothy Lovejoy
## 4        Ned Flanders
## 5       Homer Simpson
## 6      Julius Hibbert

2. Create a logical vector to indicate if the character has a title.

The titles are either 2 or 3 letters long and are listed first, so I used str_detect to anchor at the start of the string and look for either three or two characters followed by a period: “^.{3}\\.|^.{2}\\.”.

# TRUE for names that open with three or two characters followed by a period.
title <- str_detect(
  string = names,
  pattern = "^.{3}\\.|^.{2}\\."
)
# Attach the flags to the table as a new column named "title".
names_df$title <- title
names_df

##                  name title
## 1        Moe  Szyslak FALSE
## 2 C. Montgomery Burns FALSE
## 3    Timothy  Lovejoy  TRUE
## 4       Ned  Flanders FALSE
## 5       Homer Simpson FALSE
## 6     Julius  Hibbert  TRUE

3. Create a logical vector indicating whether a character has a second name.

What makes Monte Burns different is that his name is the only one with both a comma and a period, so I used str_detect to look for strings with at least one character between a comma and a period: “,.+\\.”.

# TRUE for names containing a comma, then at least one character, then a
# period.
second <- str_detect(
  string = names,
  pattern = ",.+\\."
)
# Attach the flags to the table as a new column named "scnd_nm".
names_df$scnd_nm <- second
names_df

##                  name title scnd_nm
## 1        Moe  Szyslak FALSE   FALSE
## 2 C. Montgomery Burns FALSE    TRUE
## 3    Timothy  Lovejoy  TRUE   FALSE
## 4       Ned  Flanders FALSE   FALSE
## 5       Homer Simpson FALSE   FALSE
## 6     Julius  Hibbert  TRUE   FALSE

Chapter 8 Problem 4

# Sample strings used to try out each regular expression below. c() on string
# literals already yields a character vector, so no conversion is needed.
test_list <- c(
  "5$", "$5", "five$", "7,134,689$",
  "at", "dab", "dAb", "DaB", "band", "banana",
  "homework", "homework.doc", "homework.txt", "txt", ".txt",
  "7/4/76", "07/04/76", "07/04/1776", "07-04-1776",
  "<link>rpubs.com</link>", "<bold>Got It!</link>"
)
test_list

##  [1] "5$"                     "$5"                    
##  [3] "five$"                  "7,134,689$"            
##  [5] "at"                     "dab"                   
##  [7] "dAb"                    "DaB"                   
##  [9] "band"                   "banana"                
## [11] "homework"               "homework.doc"          
## [13] "homework.txt"           "txt"                   
## [15] ".txt"                   "7/4/76"                
## [17] "07/04/76"               "07/04/1776"            
## [19] "07-04-1776"             "<link>rpubs.com</link>"
## [21] "<bold>Got It!</link>"

Describe the types of strings that conform to the following regular expressions and construct an example that is matched by the regular expression.

1. “[0-9]+\\$” matches strings containing one or more digits (0 to 9) immediately followed by a literal dollar sign.

# Pair every test string with a flag telling whether it contains one or more
# digits immediately followed by a literal dollar sign.
dollar <- cbind(
  test_list,
  dollar = str_detect(test_list, "[0-9]+\\$")
)
dollar

##       test_list                dollar 
##  [1,] "5$"                     "TRUE" 
##  [2,] "$5"                     "FALSE"
##  [3,] "five$"                  "FALSE"
##  [4,] "7,134,689$"             "TRUE" 
##  [5,] "at"                     "FALSE"
##  [6,] "dab"                    "FALSE"
##  [7,] "dAb"                    "FALSE"
##  [8,] "DaB"                    "FALSE"
##  [9,] "band"                   "FALSE"
## [10,] "banana"                 "FALSE"
## [11,] "homework"               "FALSE"
## [12,] "homework.doc"           "FALSE"
## [13,] "homework.txt"           "FALSE"
## [14,] "txt"                    "FALSE"
## [15,] ".txt"                   "FALSE"
## [16,] "7/4/76"                 "FALSE"
## [17,] "07/04/76"               "FALSE"
## [18,] "07/04/1776"             "FALSE"
## [19,] "07-04-1776"             "FALSE"
## [20,] "<link>rpubs.com</link>" "FALSE"
## [21,] "<bold>Got It!</link>"   "FALSE"

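“\\b[a-z]{1,4}\\b” detects whether a string contains a run of 1 to 4 lowercase letters with a word boundary on each side (the start or end of the string, or a neighbouring non-word character).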

# Pair every test string with a flag telling whether it contains a run of one
# to four lowercase letters that has a word boundary on each side.
words <- cbind(
  test_list,
  words = str_detect(test_list, "\\b[a-z]{1,4}\\b")
)
words

##       test_list                words  
##  [1,] "5$"                     "FALSE"
##  [2,] "$5"                     "FALSE"
##  [3,] "five$"                  "TRUE" 
##  [4,] "7,134,689$"             "FALSE"
##  [5,] "at"                     "TRUE" 
##  [6,] "dab"                    "TRUE" 
##  [7,] "dAb"                    "FALSE"
##  [8,] "DaB"                    "FALSE"
##  [9,] "band"                   "TRUE" 
## [10,] "banana"                 "FALSE"
## [11,] "homework"               "FALSE"
## [12,] "homework.doc"           "TRUE" 
## [13,] "homework.txt"           "TRUE" 
## [14,] "txt"                    "TRUE" 
## [15,] ".txt"                   "TRUE" 
## [16,] "7/4/76"                 "FALSE"
## [17,] "07/04/76"               "FALSE"
## [18,] "07/04/1776"             "FALSE"
## [19,] "07-04-1776"             "FALSE"
## [20,] "<link>rpubs.com</link>" "TRUE" 
## [21,] "<bold>Got It!</link>"   "TRUE"

“.*?\\.txt$” checks whether a string ends in “.txt”, with or without characters before the “.”.

# Pair every test string with a flag telling whether it ends in ".txt".
# The trailing `$` anchors the match to the end of the string, so a ".txt" (or
# a bare ".tx") in the middle of a string, e.g. "notes.txt.bak", is not
# flagged. The leading `.*?` allows any characters, or none, before the dot.
txt <- cbind(
  test_list,
  txt = str_detect(test_list, ".*?\\.txt$")
)
txt

##       test_list                txt    
##  [1,] "5$"                     "FALSE"
##  [2,] "$5"                     "FALSE"
##  [3,] "five$"                  "FALSE"
##  [4,] "7,134,689$"             "FALSE"
##  [5,] "at"                     "FALSE"
##  [6,] "dab"                    "FALSE"
##  [7,] "dAb"                    "FALSE"
##  [8,] "DaB"                    "FALSE"
##  [9,] "band"                   "FALSE"
## [10,] "banana"                 "FALSE"
## [11,] "homework"               "FALSE"
## [12,] "homework.doc"           "FALSE"
## [13,] "homework.txt"           "TRUE" 
## [14,] "txt"                    "FALSE"
## [15,] ".txt"                   "TRUE" 
## [16,] "7/4/76"                 "FALSE"
## [17,] "07/04/76"               "FALSE"
## [18,] "07/04/1776"             "FALSE"
## [19,] "07-04-1776"             "FALSE"
## [20,] "<link>rpubs.com</link>" "FALSE"
## [21,] "<bold>Got It!</link>"   "FALSE"

“\\d{2}/\\d{2}/\\d{4}” detects exactly two digits, then a slash, then exactly two digits, then a slash, then exactly four digits. This checks for a date formatted as dd/mm/yyyy or mm/dd/yyyy, depending on whether you are in the US or not.

# Pair every test string with a flag telling whether it contains two digits,
# a slash, two digits, a slash, and then four digits.
dates <- cbind(
  test_list,
  dates = str_detect(test_list, "\\d{2}/\\d{2}/\\d{4}")
)
dates

##       test_list                dates  
##  [1,] "5$"                     "FALSE"
##  [2,] "$5"                     "FALSE"
##  [3,] "five$"                  "FALSE"
##  [4,] "7,134,689$"             "FALSE"
##  [5,] "at"                     "FALSE"
##  [6,] "dab"                    "FALSE"
##  [7,] "dAb"                    "FALSE"
##  [8,] "DaB"                    "FALSE"
##  [9,] "band"                   "FALSE"
## [10,] "banana"                 "FALSE"
## [11,] "homework"               "FALSE"
## [12,] "homework.doc"           "FALSE"
## [13,] "homework.txt"           "FALSE"
## [14,] "txt"                    "FALSE"
## [15,] ".txt"                   "FALSE"
## [16,] "7/4/76"                 "FALSE"
## [17,] "07/04/76"               "FALSE"
## [18,] "07/04/1776"             "TRUE" 
## [19,] "07-04-1776"             "FALSE"
## [20,] "<link>rpubs.com</link>" "FALSE"
## [21,] "<bold>Got It!</link>"   "FALSE"

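“<(.+?)>.+?</\\1>” checks for a matched pair of HTML-style tags: it captures the text between “<” and “>” in the opening tag, requires at least one character of content after it, and then matches a closing “</ >” tag that has the same captured text inside the brackets (the “\\1” backreference).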

# Pair every test string with a flag telling whether it contains an opening
# <tag>, at least one character of content, and a closing </tag> whose name
# repeats the text captured from the opening tag.
html <- cbind(
  test_list,
  html = str_detect(test_list, "<(.+?)>.+?</\\1>")
)
html

##       test_list                html   
##  [1,] "5$"                     "FALSE"
##  [2,] "$5"                     "FALSE"
##  [3,] "five$"                  "FALSE"
##  [4,] "7,134,689$"             "FALSE"
##  [5,] "at"                     "FALSE"
##  [6,] "dab"                    "FALSE"
##  [7,] "dAb"                    "FALSE"
##  [8,] "DaB"                    "FALSE"
##  [9,] "band"                   "FALSE"
## [10,] "banana"                 "FALSE"
## [11,] "homework"               "FALSE"
## [12,] "homework.doc"           "FALSE"
## [13,] "homework.txt"           "FALSE"
## [14,] "txt"                    "FALSE"
## [15,] ".txt"                   "FALSE"
## [16,] "7/4/76"                 "FALSE"
## [17,] "07/04/76"               "FALSE"
## [18,] "07/04/1776"             "FALSE"
## [19,] "07-04-1776"             "FALSE"
## [20,] "<link>rpubs.com</link>" "TRUE" 
## [21,] "<bold>Got It!</link>"   "FALSE"

Extra Credit Problem

# Scrambled text for the extra-credit puzzle, stored as one string. The
# literal is broken after each period only to keep the lines manageable;
# paste0() joins the pieces back together with nothing in between.
regex_code <- paste0(
  "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.",
  "r1w1YwwojigOd6vrfUrbz2.",
  "2bkAnbhzgv4R9i05zEcrop.",
  "wAgnb.",
  "SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"
)

I tried a bunch of stuff, then I decided to look at the different letter cases, and I noticed this:

# Collect every upper-case letter of the scrambled text, in order of
# appearance; the result is a list holding one character vector.
upper <- str_extract_all(
  string = regex_code,
  pattern = "[[:upper:]]"
)
upper

## [[1]]
##  [1] "C" "O" "N" "G" "R" "A" "T" "U" "L" "A" "T" "I" "O" "N" "S" "Y" "O"
## [18] "U" "A" "R" "E" "A" "S" "U" "P" "E" "R" "N" "E" "R" "D"