Instruction: Please deliver links to an R Markdown file (in GitHub and rpubs.com) with solutions to problems 3 and 4 from chapter 8 of Automated Data Collection in R. Problem 9 is extra credit. You may work in a small group, but please submit separately with names of all group participants in your submission.
library(stringr)
# Raw text in which names and phone numbers are run together without
# separators.
raw.data <-"555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"

# Names: runs of at least two letters, periods, commas or spaces.
# raw.data is a single string, so the match list has exactly one element.
name <- str_extract_all(
  string = raw.data,
  pattern = "[[:alpha:]., ]{2,}"
)[[1]]
print(name)
## [1] "Moe Szyslak" "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" "Simpson, Homer" "Dr. Julius Hibbert"

# Phone numbers: an optional three-digit area code (optionally wrapped in
# parentheses), then three digits and four digits, each group optionally
# preceded by a dash or a space.
phone <- str_extract_all(
  string = raw.data,
  pattern = "\\(?(\\d{3})?\\)?(-| )?\\d{3}(-| )?\\d{4}"
)[[1]]
print(phone)
## [1] "555-1239" "(636) 555-0113" "555-6542" "555 8904"
## [5] "636-555-3226" "5553642"
The raw data has titles, middle names, dots et cetera. In addition to that, some people have their first name and last name reversed. So the first thing I would get started is to clean the data first. Then I created a data frame so I can enter the data into the data frame which has only two coloumns: first_name and last_name.
# Clean the extracted names (`name`) into "first_name last_name" form and
# store the two parts in a data frame.

# Remove a leading title. The period is escaped so that only a literal
# "Rev." or "Dr." is stripped (an unescaped "." matches any character).
n <- str_replace(name, pattern = "^(Rev|Dr)\\.\\s+", replacement = "")

# Swap "Last, First" into "First Last" for the entries that contain a comma.
# The capture groups exclude the whitespace around the comma, so the
# rearranged name does not start with a stray space.
reversed <- str_detect(n, ",")
n[reversed] <- str_replace(
  n[reversed],
  pattern = "^\\s*([^,]+?)\\s*,\\s*(.+?)\\s*$",
  replacement = "\\2 \\1"
)

# Drop single-letter initials such as "C." so that exactly two name parts
# remain, then trim any leftover whitespace at either end.
n <- str_trim(
  str_replace_all(n, pattern = "\\b[[:alpha:]]\\.\\s*", replacement = "")
)
n
## [1] "Moe Szyslak"      "Montgomery Burns" "Timothy Lovejoy"  "Ned Flanders"
## [5] "Homer Simpson"    "Julius Hibbert"

# Split each cleaned name into its parts. Anchoring on the start and the end
# of the string keeps the two columns from picking up the same word and keeps
# the separating space out of last_name.
first_name <- str_extract(n, "^[[:alpha:]]+")
last_name <- str_extract(n, "[[:alpha:]]+$")
n <- data.frame(first_name, last_name)
n
##   first_name last_name
## 1        Moe   Szyslak
## 2 Montgomery     Burns
## 3    Timothy   Lovejoy
## 4        Ned  Flanders
## 5      Homer   Simpson
## 6     Julius   Hibbert
# Does the raw name carry a title? A title is taken to be a whole word of two
# or three letters followed by a period ("Rev.", "Dr."). The word boundary
# stops the pattern from matching the tail of a longer word that happens to
# precede a period, and the bracket class no longer admits a literal "+".
str_detect(name, "\\b[[:alpha:]]{2,3}\\.")
## [1] FALSE FALSE TRUE FALSE FALSE TRUE

# Does the raw name contain a second (middle) name? It is recognised here by
# a single capital letter standing on its own, followed by a period ("C.").
str_detect(name, "\\b[A-Z]\\.")
## [1] FALSE TRUE FALSE FALSE FALSE FALSE
# Example strings for a series of regular expressions; each call below
# returns every match found across its example vector.

# One or more digits immediately followed by a literal dollar sign.
a <- c("123a", "456b789", "c01$")
unlist(
  str_extract_all(string = a, pattern = "[0-9]+\\$")
)
## [1] "01$"

# A whole word made of one to four lowercase letters.
b <- c("abcdefg", "hijk", "lmn", "opqrst", "uvw", "xyz")
unlist(
  str_extract_all(string = b, pattern = "\\b[a-z]{1,4}\\b")
)
## [1] "hijk" "lmn" "uvw" "xyz"

# Any string that ends in ".txt".
c <- c("abcd", "txt", "efg.txt", "hij.txtklmn")
unlist(
  str_extract_all(string = c, pattern = ".*?\\.txt$")
)
## [1] "efg.txt"

# Two digits, a slash, two digits, a slash, four digits (a date-like shape).
d <- c("ab12cd34", "56/78/2016", "123/456/789", "ab12/34/5678cd")
unlist(
  str_extract_all(string = d, pattern = "\\d{2}/\\d{2}/\\d{4}")
)
## [1] "56/78/2016" "12/34/5678"

# An opening tag, some content, and the closing tag with the same name
# (the backreference \1 repeats whatever the first group captured).
e <- c("<head>hello</head>", "<body>world</body>", "<>happy<>")
unlist(
  str_extract_all(string = e, pattern = "<(.+?)>.+?</\\1>")
)
## [1] "<head>hello</head>" "<body>world</body>"
# Decode the message hidden in the scrambled string: only the capital letters
# and the periods carry information, and each period stands for a space.
a <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"
a <- str_replace_all(
  # Pull out every run of capitals/periods and glue the runs back together.
  paste0(str_extract_all(a, "[[:upper:].]{1,}")[[1]], collapse = ""),
  pattern = "\\.",
  replacement = " "
)
print(a)
## [1] "CONGRATULATIONS YOU ARE A SUPERNERD"