DATA 607 HW3

Chapter 8 Problem 3

library("stringr")

## Warning: package 'stringr' was built under R version 3.4.1

# Raw phone-book text used throughout Problem 3: names and phone numbers are
# run together in a single string. It is assembled here from one piece per
# entry so each phone number / name pair is easy to read.
raw.data <- paste0(
  "555-1239Moe Szyslak",
  "(636) 555-0113Burns, C. Montgomery",
  "555-6542Rev. Timothy Lovejoy",
  "555 8904Ned Flanders",
  "636-555-3226Simpson, Homer",
  "5553642Dr. Julius Hibbert"
)
raw.data

## [1] "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"

Copy the introductory example. The vector `names` stores the extracted names.

# A name is any run of two or more letters, periods, commas or spaces, which
# skips over the digits and punctuation of the phone numbers in between.
name_matches <- str_extract_all(raw.data, "[[:alpha:]., ]{2,}")
# str_extract_all() returns a list (one element per input string); flatten it
# into a plain character vector of names.
names <- unlist(name_matches)
names

## [1] "Moe Szyslak"          "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders"         "Simpson, Homer"       "Dr. Julius Hibbert"

1. Use the tools of this chapter to rearrange the vector so that all the elements conform to the standard first_name last_name.

The first regex I use, “,.+$”, matches a comma followed by one or more characters up to the end of the string. This pulls the first names of Monte Burns and Homer Simpson. The next one, “\..+ ” (note the trailing space), matches a period followed by one or more characters up to the last space in the string; this is for Rev. Lovejoy and Dr. Hibbert. Finally, for Moe and Ned, the regex “^.[^\\.]{2} ” matches exactly 3 characters at the start of the string, the second and third of which are not periods, followed by a space.

For the last names there were two patterns: either the last name comes first and ends in a comma, as for Homer and Monte, “.+,”, or the last name is the final 7 or 8 characters of the string, preceded by a space, for everyone else, “[ [:blank:]].{7,8}$”.

I used for loops to take out unwanted spaces and punctuation marks.

# Pull the first-name part out of every entry. Each name matches exactly one
# of the three alternatives once, so str_extract() (first match only) is
# enough and returns a plain character vector rather than the list that
# str_extract_all() produces.
first_name <- str_extract(names, ",.+$|\\..+ |^.[^\\.]{2} ")
# Clean up the data a little by getting rid of punctuation and leading spaces
for (i in seq_along(first_name)) {
  first_name[i] <- gsub(", ", "", first_name[i])
  first_name[i] <- gsub("\\. ", "", first_name[i])
  first_name[i] <- gsub(" ", "", first_name[i])
  # Instead of dropping the C. from Monte's name, I opted to keep it in, as I
  # have a friend that does this on his social media and uses it as his pen
  # name. The substitutions above collapse "C. Montgomery" to "CMontgomery",
  # so the initial is restored here.
  first_name[i] <- gsub("CM", "C. M", first_name[i])
}

# Last name: either everything up to the comma ("Last, First" entries) or the
# final 7-8 characters preceded by a space (everyone else).
last_name <- str_extract(names, ".+,|[ [:blank:]].{7,8}$")
for (i in seq_along(last_name)) {
  last_name[i] <- gsub(",", "", last_name[i])
  # The second alternative captures the space in front of the surname; strip
  # it so paste() below does not produce a double space ("Moe  Szyslak").
  last_name[i] <- trimws(last_name[i])
}

# paste() joins the two vectors element-wise with a single space.
names_df <- data.frame(name = paste(first_name, last_name))
names_df

##                  name
## 1        Moe  Szyslak
## 2 C. Montgomery Burns
## 3    Timothy  Lovejoy
## 4       Ned  Flanders
## 5       Homer Simpson
## 6     Julius  Hibbert

(2) Create a logical vector to indicate if the character has a title.

The titles are either 2 or 3 letters long and are listed first, so I used str_detect to anchor at the start of the string and look for either 3 or 2 characters followed by a period: “^.{3}\.|^.{2}\.”.

# TRUE when the entry starts with three, or two, characters followed by a
# period. The check runs against the raw `names` vector, where any title is
# still in front.
title <- str_detect(names, "^.{3}\\.|^.{2}\\.")
# Attach the flag to the results table as a new column.
names_df$title <- title
names_df

##                  name title
## 1        Moe  Szyslak FALSE
## 2 C. Montgomery Burns FALSE
## 3    Timothy  Lovejoy  TRUE
## 4       Ned  Flanders FALSE
## 5       Homer Simpson FALSE
## 6     Julius  Hibbert  TRUE

(3) Create a logical vector indicating whether a character has a second name.

What makes Monte Burns different is his name is the only one with a comma and a period, so I used str_detect to look for strings with at least one character between a comma and period “,.+\.”.

# TRUE when a comma is followed, after at least one other character, by a
# period somewhere later in the raw entry.
second <- str_detect(names, ",.+\\.")
# Attach the flag to the results table as a new column.
names_df$scnd_nm <- second
names_df

##                  name title scnd_nm
## 1        Moe  Szyslak FALSE   FALSE
## 2 C. Montgomery Burns FALSE    TRUE
## 3    Timothy  Lovejoy  TRUE   FALSE
## 4       Ned  Flanders FALSE   FALSE
## 5       Homer Simpson FALSE   FALSE
## 6     Julius  Hibbert  TRUE   FALSE

Chapter 8 Problem 4

# Sample strings used to exercise each regular expression in Problem 4.
# c() of string literals is already a character vector.
test_list <- c(
  "5$", "$5", "five$", "7,134,689$",
  "at", "dab", "dAb", "DaB", "band", "banana",
  "homework", "homework.doc", "homework.txt", "txt", ".txt",
  "7/4/76", "07/04/76", "07/04/1776", "07-04-1776",
  "<link>rpubs.com</link>", "<bold>Got It!</link>"
)
test_list

##  [1] "5$"                     "$5"                    
##  [3] "five$"                  "7,134,689$"            
##  [5] "at"                     "dab"                   
##  [7] "dAb"                    "DaB"                   
##  [9] "band"                   "banana"                
## [11] "homework"               "homework.doc"          
## [13] "homework.txt"           "txt"                   
## [15] ".txt"                   "7/4/76"                
## [17] "07/04/76"               "07/04/1776"            
## [19] "07-04-1776"             "<link>rpubs.com</link>"
## [21] "<bold>Got It!</link>"

Describe the types of strings that conform to the following regular expressions and construct an example that is matched by the regular expression.

[0-9]+\$ one or more digits 0 to 9 followed by a dollar sign.

# Pair every test string with whether it contains one or more digits directly
# followed by a literal dollar sign. cbind() of a character vector and a
# logical vector gives a two-column character matrix.
dollar <- cbind(test_list, dollar = str_detect(test_list, "[0-9]+\\$"))
dollar

##       test_list                dollar 
##  [1,] "5$"                     "TRUE" 
##  [2,] "$5"                     "FALSE"
##  [3,] "five$"                  "FALSE"
##  [4,] "7,134,689$"             "TRUE" 
##  [5,] "at"                     "FALSE"
##  [6,] "dab"                    "FALSE"
##  [7,] "dAb"                    "FALSE"
##  [8,] "DaB"                    "FALSE"
##  [9,] "band"                   "FALSE"
## [10,] "banana"                 "FALSE"
## [11,] "homework"               "FALSE"
## [12,] "homework.doc"           "FALSE"
## [13,] "homework.txt"           "FALSE"
## [14,] "txt"                    "FALSE"
## [15,] ".txt"                   "FALSE"
## [16,] "7/4/76"                 "FALSE"
## [17,] "07/04/76"               "FALSE"
## [18,] "07/04/1776"             "FALSE"
## [19,] "07-04-1776"             "FALSE"
## [20,] "<link>rpubs.com</link>" "FALSE"
## [21,] "<bold>Got It!</link>"   "FALSE"

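\b[a-z]{1,4}\b detects whether a string contains a run of 1 to 4 lower case letters standing as a whole word, i.e. bounded on both sides by a word boundary (a non-word character or the start or end of the string).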

# Pair every test string with whether it contains a run of 1-4 lower case
# letters delimited by word boundaries on both sides.
words <- cbind(test_list, words = str_detect(test_list, "\\b[a-z]{1,4}\\b"))
words

##       test_list                words  
##  [1,] "5$"                     "FALSE"
##  [2,] "$5"                     "FALSE"
##  [3,] "five$"                  "TRUE" 
##  [4,] "7,134,689$"             "FALSE"
##  [5,] "at"                     "TRUE" 
##  [6,] "dab"                    "TRUE" 
##  [7,] "dAb"                    "FALSE"
##  [8,] "DaB"                    "FALSE"
##  [9,] "band"                   "TRUE" 
## [10,] "banana"                 "FALSE"
## [11,] "homework"               "FALSE"
## [12,] "homework.doc"           "TRUE" 
## [13,] "homework.txt"           "TRUE" 
## [14,] "txt"                    "TRUE" 
## [15,] ".txt"                   "TRUE" 
## [16,] "7/4/76"                 "FALSE"
## [17,] "07/04/76"               "FALSE"
## [18,] "07/04/1776"             "FALSE"
## [19,] "07-04-1776"             "FALSE"
## [20,] "<link>rpubs.com</link>" "TRUE" 
## [21,] "<bold>Got It!</link>"   "TRUE"

.*?\.txt$ checks the end of a string for a .txt that may or may not have characters before the “.”.

# Pair every test string with whether it ends in the literal extension ".txt",
# optionally preceded by other characters. The pattern is anchored with `$`,
# as in the exercise; the earlier version ended in `?` instead, which made the
# final "t" optional and left the match unanchored, so strings such as
# "notes.tx" or "a.txt.bak" would also have matched.
txt <- cbind(test_list, txt = str_detect(test_list, ".*?\\.txt$"))
txt

##       test_list                txt    
##  [1,] "5$"                     "FALSE"
##  [2,] "$5"                     "FALSE"
##  [3,] "five$"                  "FALSE"
##  [4,] "7,134,689$"             "FALSE"
##  [5,] "at"                     "FALSE"
##  [6,] "dab"                    "FALSE"
##  [7,] "dAb"                    "FALSE"
##  [8,] "DaB"                    "FALSE"
##  [9,] "band"                   "FALSE"
## [10,] "banana"                 "FALSE"
## [11,] "homework"               "FALSE"
## [12,] "homework.doc"           "FALSE"
## [13,] "homework.txt"           "TRUE" 
## [14,] "txt"                    "FALSE"
## [15,] ".txt"                   "TRUE" 
## [16,] "7/4/76"                 "FALSE"
## [17,] "07/04/76"               "FALSE"
## [18,] "07/04/1776"             "FALSE"
## [19,] "07-04-1776"             "FALSE"
## [20,] "<link>rpubs.com</link>" "FALSE"
## [21,] "<bold>Got It!</link>"   "FALSE"

\d{2}/\d{2}/\d{4} detects exactly 2 digits, then a “/”, then exactly 2 digits, then a “/”, then exactly 4 digits. This checks for a date formatted as dd/mm/yyyy or mm/dd/yyyy, depending on whether you are in the US or not.

# Pair every test string with whether it contains a date written as two
# digits, a slash, two digits, a slash and four digits.
dates <- cbind(test_list, dates = str_detect(test_list, "\\d{2}/\\d{2}/\\d{4}"))
dates

##       test_list                dates  
##  [1,] "5$"                     "FALSE"
##  [2,] "$5"                     "FALSE"
##  [3,] "five$"                  "FALSE"
##  [4,] "7,134,689$"             "FALSE"
##  [5,] "at"                     "FALSE"
##  [6,] "dab"                    "FALSE"
##  [7,] "dAb"                    "FALSE"
##  [8,] "DaB"                    "FALSE"
##  [9,] "band"                   "FALSE"
## [10,] "banana"                 "FALSE"
## [11,] "homework"               "FALSE"
## [12,] "homework.doc"           "FALSE"
## [13,] "homework.txt"           "FALSE"
## [14,] "txt"                    "FALSE"
## [15,] ".txt"                   "FALSE"
## [16,] "7/4/76"                 "FALSE"
## [17,] "07/04/76"               "FALSE"
## [18,] "07/04/1776"             "TRUE" 
## [19,] "07-04-1776"             "FALSE"
## [20,] "<link>rpubs.com</link>" "FALSE"
## [21,] "<bold>Got It!</link>"   "FALSE"

<(.+?)>.+?</\1> checks for HTML tags: it captures the tag name between < and >, requires at least one character of content after it, and then matches a closing </ > tag containing the same captured string (the \1 backreference) in the brackets.

# Pair every test string with whether it contains an opening tag, some
# content, and a closing tag whose name equals the captured opening tag name
# (the \\1 backreference).
html <- cbind(test_list, html = str_detect(test_list, "<(.+?)>.+?</\\1>"))
html

##       test_list                html   
##  [1,] "5$"                     "FALSE"
##  [2,] "$5"                     "FALSE"
##  [3,] "five$"                  "FALSE"
##  [4,] "7,134,689$"             "FALSE"
##  [5,] "at"                     "FALSE"
##  [6,] "dab"                    "FALSE"
##  [7,] "dAb"                    "FALSE"
##  [8,] "DaB"                    "FALSE"
##  [9,] "band"                   "FALSE"
## [10,] "banana"                 "FALSE"
## [11,] "homework"               "FALSE"
## [12,] "homework.doc"           "FALSE"
## [13,] "homework.txt"           "FALSE"
## [14,] "txt"                    "FALSE"
## [15,] ".txt"                   "FALSE"
## [16,] "7/4/76"                 "FALSE"
## [17,] "07/04/76"               "FALSE"
## [18,] "07/04/1776"             "FALSE"
## [19,] "07-04-1776"             "FALSE"
## [20,] "<link>rpubs.com</link>" "TRUE" 
## [21,] "<bold>Got It!</link>"   "FALSE"