607 Week 3 HW KLS

Problem 3.1

#install.packages("stringr")
#install.packages("XML")
library(stringr)
library(XML)

# Raw input: phone numbers and names run together with no delimiters.
name.phone.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"

# A name is any run of two or more letters, periods, commas or spaces.
names <- unlist(
  str_extract_all(string = name.phone.data, pattern = "[[:alpha:]., ]{2,}")
)
print(names)

## [1] "Moe Szyslak"          "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders"         "Simpson, Homer"       "Dr. Julius Hibbert"

# Drop the middle initial "C." and squeeze the double space it leaves behind,
# in preparation for rearranging to "first name last name".
names <- gsub("  ", " ", str_replace(names, "C\\.", ""))
print(names)

## [1] "Moe Szyslak"          "Burns, Montgomery"    "Rev. Timothy Lovejoy"
## [4] "Ned Flanders"         "Simpson, Homer"       "Dr. Julius Hibbert"

# Turn "Last, First" into "First Last"; names without a comma pass through.
rearranged <- sub(
  pattern = "(\\w+),\\s(\\w+)",
  replacement = "\\2 \\1",
  x = names
)
print(rearranged)

## [1] "Moe Szyslak"          "Montgomery Burns"     "Rev. Timothy Lovejoy"
## [4] "Ned Flanders"         "Homer Simpson"        "Dr. Julius Hibbert"

# Strip the first period found in each name ...
rearranged <- str_replace(rearranged, "\\.", "")
# ... then put the period back after the "Rev" title ...
rearranged <- str_replace(rearranged, "Rev", "Rev\\.")
# ... and after the "Dr" title.
rearranged <- str_replace(rearranged, "Dr", "Dr\\.")
print(rearranged)

## [1] "Moe Szyslak"          "Montgomery Burns"     "Rev. Timothy Lovejoy"
## [4] "Ned Flanders"         "Homer Simpson"        "Dr. Julius Hibbert"

# Remove a leading title ("Dr." or "Rev.") together with the whitespace that
# follows it. The pattern is anchored at the start of the string so that names
# which merely contain "Dr" or "Rev" (e.g. "Drew"), or a period elsewhere
# (e.g. a middle initial), are left untouched.
notitle <- str_replace(
  rearranged,
  pattern = "^(?:Dr|Rev)\\.\\s*",
  replacement = ""
)
# Trim any leftover leading/trailing whitespace.
notitle <- str_trim(notitle)
notitle

## [1] "Moe Szyslak"      "Montgomery Burns" "Timothy Lovejoy" 
## [4] "Ned Flanders"     "Homer Simpson"    "Julius Hibbert"

Problem 3.2

# Flag the names that carry the "Dr." title
dr <- str_detect(string = rearranged, pattern = "Dr\\.")
print(dr)

## [1] FALSE FALSE FALSE FALSE FALSE  TRUE

# Flag the names that carry the "Rev." title
reverend <- str_detect(string = rearranged, pattern = "Rev\\.")
print(reverend)

## [1] FALSE FALSE  TRUE FALSE FALSE FALSE

# Flag names carrying either title, using a single alternation pattern
drandrev <- str_detect(string = rearranged, pattern = "Dr\\.|Rev\\.")
print(drandrev)

## [1] FALSE FALSE  TRUE FALSE FALSE  TRUE

Problem 3.3

# Count how many people have exactly two names (first and last).
# Runs of spaces are collapsed first so stray blanks do not create empty
# tokens; lengths() then gives the word count of every name rather than
# inspecting only the sixth one.
spacecount <- gsub(" {2,}", " ", notitle)
sum(lengths(strsplit(spacecount, " ")) == 2)

## [1] 6

# Count the words in each name, then turn the counts into a logical vector:
# TRUE when a name has more than two words, i.e. the person has a second
# (middle) name in addition to a first and last name.
twonames <- str_count(notitle, "\\w+")
twonames <- twonames > 2
twonames

## [1] FALSE FALSE FALSE FALSE FALSE FALSE

# Count how many people have only a single name (one word once runs of
# spaces are collapsed). The check covers every name rather than inspecting
# only the sixth one.
spacecount <- gsub(" +", " ", notitle)
sum(lengths(strsplit(spacecount, " ")) == 1)

## [1] 0

Problem 4a

[0-9]+\$ This matches one or more digits (0-9) immediately followed by a literal dollar sign. The dollar sign does not have to be at the end of the string.

# Only the strings containing digits directly followed by "$" yield a match.
foura <- str_extract(
  string = c(
    "607$", "sixohsevenisgreat$", "607isgreat", "607607$**", "607607**$"
  ),
  pattern = "[0-9]+\\$"
)
print(foura)

## [1] "607$"    NA        NA        "607607$" NA

Problem 4b

\b[a-z]{1,4}\b This matches a whole word made of one to four lowercase letters. The \b on each side is a word boundary (a space, punctuation, or the start or end of the string), not necessarily a single space.

# Only the short lowercase words ("six", "oh") yield a match.
fourb <- str_extract(
  string = c(" 607 ", " sixohseven ", " six ", " oh ", " seven "),
  pattern = "\\b[a-z]{1,4}\\b"
)
print(fourb)

## [1] NA    NA    "six" "oh"  NA

Problem 4c

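.*?\.txt$ This matches any string that ends in the literal text .txt. The .*? lazily matches zero or more of any character before it, and $ anchors the match to the end of the string, so nothing (not even a space) may follow .txt.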

# Only a string that ends exactly in ".txt" yields a match; the trailing
# space in the second string prevents one.
fourc <- str_extract(
  string = c("607.txt", "sixohseven.txt ", "sixtxt.xls"),
  pattern = ".*?\\.txt$"
)
print(fourc)

## [1] "607.txt" NA        NA

Problem 4d

\d{2}/\d{2}/\d{4} This matches two digits, a forward slash, two digits, a forward slash, and four digits, for example a date written as mm/dd/yyyy.

# Only the strings shaped like dd/dd/dddd yield a match.
fourd <- str_extract(
  string = c(
    "02/15/2017", "607/607/607", "60/7607/60 ", "hi/607/hi", "60/76/6076"
  ),
  pattern = "\\d{2}/\\d{2}/\\d{4}"
)
print(fourd)

## [1] "02/15/2017" NA           NA           NA           "60/76/6076"

Problem 4e

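<(.+?)>.+?</\1> This matches an opening tag, at least one character of content, and a closing tag whose name repeats the opening tag's name (via the backreference \1), i.e. a properly paired HTML element such as <b>Hi</b>.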

# Only the strings whose closing tag mirrors the opening tag yield a match.
foure <- str_extract(
  string = c("<b>Hi</b>", "<i>Hello<i>", "<u>Goodbye<u/>", "<u>Goodbye</u>"),
  pattern = "<(.+?)>.+?</\\1>"
)
print(foure)

## [1] "<b>Hi</b>"      NA               NA               "<u>Goodbye</u>"

Problem 9

secret.code <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0Tanwo Uwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigO d6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5 fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"

# Pull out every lowercase letter to see whether they spell a message
secret.code.lower <- str_extract_all(
  string = secret.code,
  pattern = "[:lower:]"
)
print(secret.code.lower)

## [[1]]
##   [1] "c" "l" "c" "o" "p" "o" "w" "z" "m" "s" "t" "c" "d" "w" "n" "k" "i"
##  [18] "g" "v" "d" "i" "c" "p" "u" "g" "g" "v" "h" "r" "y" "n" "j" "u" "w"
##  [35] "c" "z" "i" "h" "q" "r" "f" "p" "x" "s" "j" "d" "w" "p" "n" "a" "n"
##  [52] "w" "o" "w" "i" "s" "d" "i" "j" "j" "k" "p" "f" "d" "r" "c" "o" "c"
##  [69] "b" "t" "y" "c" "z" "j" "a" "t" "a" "o" "o" "t" "j" "t" "j" "n" "e"
##  [86] "c" "f" "e" "k" "r" "w" "w" "w" "o" "j" "i" "g" "d" "v" "r" "f" "r"
## [103] "b" "z" "b" "k" "n" "b" "h" "z" "g" "v" "i" "z" "c" "r" "o" "p" "w"
## [120] "g" "n" "b" "q" "o" "f" "a" "o" "t" "f" "b" "w" "m" "k" "t" "s" "z"
## [137] "q" "e" "f" "y" "n" "d" "t" "k" "c" "f" "g" "m" "c" "g" "x" "o" "n"
## [154] "h" "k" "g" "r"

# The lowercase letters spell nothing readable, so extract the uppercase ones
secret.code.upper <- str_extract_all(
  string = secret.code,
  pattern = "[:upper:]"
)
print(secret.code.upper)

## [[1]]
##  [1] "C" "O" "N" "G" "R" "A" "T" "U" "L" "A" "T" "I" "O" "N" "S" "Y" "O"
## [18] "U" "A" "R" "E" "A" "S" "U" "P" "E" "R" "N" "E" "R" "D"