Week 3 Assignment

1. Use the tools of this chapter to rearrange the vector so that all elements conform to the standard first_name last_name

library('stringr')

## Warning: package 'stringr' was built under R version 3.3.3

library('tidyr')

## Warning: package 'tidyr' was built under R version 3.3.3

raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson,Homer5553642Dr. Julius Hibbert"

# Names are the runs (two or more characters) of letters, periods, commas and
# spaces that sit between the phone numbers.
name <- unlist(str_extract_all(raw.data, "[[:alpha:]., ]{2,}"))
name

## [1] "Moe Szyslak"          "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders"         "Simpson,Homer"        "Dr. Julius Hibbert"

# Phone numbers: an optional (possibly parenthesised) three-digit area code,
# then three and four digits, each optionally separated by "-" or " ".
phone <- unlist(str_extract_all(raw.data, "\\(?(\\d{3})?\\)?(-| )?\\d{3}(-| )?\\d{4}"))

# Drop every token that ends in a period -- titles ("Rev.", "Dr.") and initials
# ("C.") -- together with the whitespace that follows it, so that no double
# space is left behind in the middle of a name.
pop.prefix <- str_trim(str_replace_all(name, "[[:alpha:]]+\\.\\s*", ""))
pop.prefix

## [1] "Moe Szyslak"       "Burns, Montgomery" "Timothy Lovejoy"
## [4] "Ned Flanders"      "Simpson,Homer"     "Julius Hibbert"

# Turn "last_name, first_name" into "first_name last_name". The pattern keys on
# the comma rather than on specific names, so it works for any entry written
# last-name-first; entries without a comma are left untouched.
clean.names <- str_replace(pop.prefix, "^([^,]+),\\s*(.+)$", "\\2 \\1")

# Keep the columns as character vectors regardless of the R version's default.
df2 <- data.frame(Name = clean.names, Phone = phone, stringsAsFactors = FALSE)
df2

##               Name          Phone
## 1      Moe Szyslak       555-1239
## 2 Montgomery Burns (636) 555-0113
## 3  Timothy Lovejoy       555-6542
## 4     Ned Flanders       555 8904
## 5    Homer Simpson   636-555-3226
## 6   Julius Hibbert        5553642

# Split the Name column into first and last name: the last space-free token is
# the last name, everything before it is the first name.
extract(df2, Name, into = c("FirstName", "LastName"), regex = "(.*)\\s+([^ ]+)$")

##    FirstName LastName          Phone
## 1        Moe  Szyslak       555-1239
## 2 Montgomery    Burns (636) 555-0113
## 3    Timothy  Lovejoy       555-6542
## 4        Ned Flanders       555 8904
## 5      Homer  Simpson   636-555-3226
## 6     Julius  Hibbert        5553642

(b) Construct a logical vector indicating whether a character has a title (i.e., Rev. and Dr.).

# A title is an abbreviation of two or more letters followed by a period
# ("Rev.", "Dr."); the single-letter initial "C." does not qualify.
prefix <- str_detect(name, "[A-Za-z]{2,}\\.")
prefix

## [1] FALSE FALSE  TRUE FALSE FALSE  TRUE

# Reorder "last_name, first_name" entries into "first_name last_name" while
# keeping titles and the initial "C.". The pattern keys on the comma instead of
# on specific names, so any last-name-first entry is handled.
fix2 <- str_replace(name, "^([^,]+),\\s*(.+)$", "\\2 \\1")

data.frame(Name = fix2, Title = prefix)

##                   Name Title
## 1          Moe Szyslak FALSE
## 2  C. Montgomery Burns FALSE
## 3 Rev. Timothy Lovejoy  TRUE
## 4         Ned Flanders FALSE
## 5        Homer Simpson FALSE
## 6   Dr. Julius Hibbert  TRUE

(c) Construct a logical vector indicating whether a character has a second name.

# Strip only a leading title ("Rev.", "Dr."): two or more letters plus a period
# at the start of the string. The one-letter initial "C." is kept.
pop.prefix2 <- str_trim(str_replace(fix2, "^[[:alpha:]]{2,}\\.\\s*", ""))

# With the title gone, a character has a second name when at least three
# whitespace-separated parts remain. Counting parts also catches second names
# that are written as an initial (e.g. "Homer J. Simpson").
find_middle <- str_count(pop.prefix2, "\\S+") >= 3

df <- data.frame(Name = pop.prefix2, Middle_Name = find_middle)

df

##                  Name Middle_Name
## 1         Moe Szyslak       FALSE
## 2 C. Montgomery Burns        TRUE
## 3     Timothy Lovejoy       FALSE
## 4        Ned Flanders       FALSE
## 5       Homer Simpson       FALSE
## 6      Julius Hibbert       FALSE

Describe the types of strings that conform to the following regular expressions and construct an example that is matched by the regular expression. a. `[0-9]+\\$`

# a. [0-9]+\\$ -- a run of one or more digits immediately followed by a literal
# dollar sign (the backslashes escape "$", so it is not an end-of-string anchor).
# Every "<digits>$" chunk of the example string below is matched.
test <- "asl1$kfda$2309$laksjf$al;kjafd39444$"
str_extract_all(string = test, pattern = "[0-9]+\\$")

## [[1]]
## [1] "1$"     "2309$"  "39444$"

# b. \\b[a-z]{1,4}\\b -- whole words made of one to four lower-case letters.
# \\b is a word boundary, and digits count as word characters, so the letters
# inside tokens such as "23h2a" or "ao38s" do not match.
test <- "23h2a lets w9sja2 get ao38s an 23oas a"
str_extract_all(string = test, pattern = "\\b[a-z]{1,4}\\b")

## [[1]]
## [1] "lets" "get"  "an"   "a"

c. `.*?\\.txt$`

# c. .*?\\.txt$ -- any string that ends in ".txt": a lazily matched run of
# arbitrary characters (possibly empty), then a literal ".txt" anchored to the
# end of the string. Only the third file name below qualifies.
test <- c(
  "test.pdf",
  "test.jpg",
  "test.txt"
)
str_extract_all(string = test, pattern = ".*?\\.txt$")

## [[1]]
## character(0)
## 
## [[2]]
## character(0)
## 
## [[3]]
## [1] "test.txt"

# d. \\d{2}/\\d{2}/\\d{4} -- two digits, a slash, two digits, a slash and four
# digits, i.e. a date laid out like MM/DD/YYYY. One-digit months and two-digit
# years (the first two examples) do not match.
test <- c(
  "2/26/92",
  "02/26/92",
  "02/26/1492"
)
str_extract_all(string = test, pattern = "\\d{2}/\\d{2}/\\d{4}")

## [[1]]
## character(0)
## 
## [[2]]
## character(0)
## 
## [[3]]
## [1] "02/26/1492"

e. `<(.+?)>.+?</\\1>`

# e. <(.+?)>.+?</\\1> -- an opening tag, at least one character of content and
# the matching closing tag. (.+?) lazily captures the tag name, and \\1 is a
# backreference that requires the closing tag to repeat that captured name.
# The third example has no closing tag, so it does not match.
test <- c(
  "<span>homeruns</span>",
  "<class>Hello!</class>",
  "<where300/?"
)
str_extract_all(string = test, pattern = "<(.+?)>.+?</\\1>")

## [[1]]
## [1] "<span>homeruns</span>"
## 
## [[2]]
## [1] "<class>Hello!</class>"
## 
## [[3]]
## character(0)

The following code hides a secret message. Crack it with R and regular expressions. Hint: Some of the characters are more revealing than others! The code snippet is also available in the materials at www.r-datacollection.com.

jumble <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0Tanwo Uwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigO d6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5 fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk! gr"

# The "more revealing" characters are the capitals: pulling them out spells the
# message, but with all words run together.
caps <- unlist(str_extract_all(jumble, "[[:upper:]]"))
str_c(caps, collapse = "")

## [1] "CONGRATULATIONSYOUAREASUPERNERD"

# The "." and "!" characters in the jumble are part of the message as well:
# each period separates two words and the "!" ends it. Keep them in the
# extraction and turn the periods into spaces to get a readable sentence.
revealed <- unlist(str_extract_all(jumble, "[[:upper:].!]"))
secret <- str_replace_all(str_c(revealed, collapse = ""), fixed("."), " ")
secret

## [1] "CONGRATULATIONS YOU ARE A SUPERNERD!"

Week 3 Assignment

Michael D’Acampora

September 15, 2017

1. Use the tools of this chapter to rearrange the vector so that all elements conform to the standard first_name last_name

(b) Construct a logical vector indicating whether a character has a title (i.e., Rev. and Dr.).

(c) Construct a logical vector indicating whether a character has a second name.